Merge tag 'v2.6.38' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8298e676f5a015f58d1b6005cf85938f8c8e142a..91cb1d71f018d8b6b77c029dca1b99cb2855ef21 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -61,6 +61,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -163,8 +164,13 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+       char *buffer;
+};
+
 struct packet_ring_buffer {
-       char                    **pg_vec;
+       struct pgv              *pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
@@ -217,6 +223,13 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)   ((struct packet_skb_cb *)((__skb)->cb))
 
+static inline __pure struct page *pgv_to_page(void *addr)
+{
+       if (is_vmalloc_addr(addr))
+               return vmalloc_to_page(addr);
+       return virt_to_page(addr);
+}
+
 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
        union {
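
Note: ring blocks can now come from vmalloc(), whose pages are not in the kernel's linear mapping, so virt_to_page() would return a bogus struct page for them. pgv_to_page() picks the right translation per address. A minimal, hypothetical module sketch (not part of this patch) showing the same dispatch on a vzalloc()ed buffer:

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

static struct page *demo_addr_to_page(void *addr)
{
	/* Same test pgv_to_page() uses: vmalloc addresses need vmalloc_to_page(). */
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static int __init demo_init(void)
{
	char *buf = vzalloc(4 * PAGE_SIZE);
	char *p;

	if (!buf)
		return -ENOMEM;
	for (p = buf; p < buf + 4 * PAGE_SIZE; p += PAGE_SIZE)
		pr_info("vaddr %p -> pfn %lu\n", p,
			page_to_pfn(demo_addr_to_page(p)));
	vfree(buf);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
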
@@ -229,11 +242,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
-               flush_dcache_page(virt_to_page(&h.h1->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
-               flush_dcache_page(virt_to_page(&h.h2->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
@@ -256,10 +269,10 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
-               flush_dcache_page(virt_to_page(&h.h1->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
-               flush_dcache_page(virt_to_page(&h.h2->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
@@ -283,7 +296,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;
 
-       h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+       h.raw = rb->pg_vec[pg_vec_pos].buffer +
+               (frame_offset * rb->frame_size);
 
        if (status != __packet_get_status(po, h.raw))
                return NULL;
@@ -503,7 +517,8 @@ out_free:
        return err;
 }
 
-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+static inline unsigned int run_filter(const struct sk_buff *skb,
+                                     const struct sock *sk,
                                      unsigned int res)
 {
        struct sk_filter *filter;
@@ -511,22 +526,22 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
        rcu_read_lock_bh();
        filter = rcu_dereference_bh(sk->sk_filter);
        if (filter != NULL)
-               res = sk_run_filter(skb, filter->insns, filter->len);
+               res = sk_run_filter(skb, filter->insns);
        rcu_read_unlock_bh();
 
        return res;
 }
 
 /*
-   This function makes lazy skb cloning in hope that most of packets
-   are discarded by BPF.
-
-   Note tricky part: we DO mangle shared skb! skb->data, skb->len
-   and skb->cb are mangled. It works because (and until) packets
-   falling here are owned by current CPU. Output packets are cloned
-   by dev_queue_xmit_nit(), input packets are processed by net_bh
-   sequencially, so that if we return skb to original state on exit,
-   we will not harm anyone.
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequencially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
  */
 
 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
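
Note: sk_run_filter() loses its length argument here, following the filter-engine rework pulled in with this merge, and run_filter() now takes const pointers since it only reads the skb and socket. For context (not part of the patch), a classic BPF program reaches the sk->sk_filter that run_filter() dereferences via SO_ATTACH_FILTER, roughly like this accept-everything example:

#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/filter.h>
#include <stdio.h>

int main(void)
{
	/* BPF_RET | BPF_K: accept up to 0xffff bytes of every packet.
	 * Real programs usually come from "tcpdump -dd <expr>".
	 * Needs CAP_NET_RAW for the AF_PACKET socket. */
	struct sock_filter code[] = {
		{ 0x06, 0, 0, 0x0000ffff },
	};
	struct sock_fprog prog = { .len = 1, .filter = code };
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
				 &prog, sizeof(prog)) < 0) {
		perror("SO_ATTACH_FILTER");
		return 1;
	}
	printf("filter attached\n");
	return 0;
}
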
@@ -552,11 +567,11 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
-                  exported to higher levels.
-
-                  Otherwise, the device hides datails of it frame
-                  structure, so that corresponding packet head
-                  never delivered to user.
+                * exported to higher levels.
+                *
+                * Otherwise, the device hides details of its frame
+                * structure, so that corresponding packet head is
+                * never delivered to user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
@@ -791,17 +806,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 
        __packet_set_status(po, h.raw, status);
        smp_mb();
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        {
-               struct page *p_start, *p_end;
-               u8 *h_end = h.raw + macoff + snaplen - 1;
-
-               p_start = virt_to_page(h.raw);
-               p_end = virt_to_page(h_end);
-               while (p_start <= p_end) {
-                       flush_dcache_page(p_start);
-                       p_start++;
-               }
+               u8 *start, *end;
+
+               end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
+               for (start = h.raw; start < end; start += PAGE_SIZE)
+                       flush_dcache_page(pgv_to_page(start));
        }
+#endif
 
        sk->sk_data_ready(sk, 0);
 
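
Note: the dcache flush now walks the frame page by page through pgv_to_page(), so vmalloc-backed frames are covered too, and the whole block compiles out on architectures that do not implement flush_dcache_page(). The consumer on the other side of this handshake polls tp_status in the mapped ring; a rough, illustrative TPACKET_V1 loop (not from the patch, assuming frames packed back to back, i.e. tp_block_size a multiple of tp_frame_size):

#include <linux/if_packet.h>
#include <poll.h>
#include <stddef.h>

static void consume_ring(int fd, char *ring, unsigned int frame_nr,
			 unsigned int frame_size)
{
	unsigned int i = 0;

	for (;;) {
		struct tpacket_hdr *hdr =
			(struct tpacket_hdr *)(ring + (size_t)i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}
		/* Payload starts hdr->tp_mac bytes into the frame;
		 * hdr->tp_snaplen bytes were captured. */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the frame back */
		i = (i + 1) % frame_nr;
	}
}
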
@@ -907,7 +920,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
        }
 
        err = -EFAULT;
-       page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);
@@ -926,11 +938,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                        return -EFAULT;
                }
 
+               page = pgv_to_page(data);
+               data += len;
                flush_dcache_page(page);
                get_page(page);
-               skb_fill_page_desc(skb,
-                               nr_frags,
-                               page++, offset, len);
+               skb_fill_page_desc(skb, nr_frags, page, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
@@ -1638,8 +1650,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-                       vnet_hdr.csum_start = skb->csum_start -
-                                                       skb_headroom(skb);
+                       vnet_hdr.csum_start = skb_checksum_start_offset(skb);
                        vnet_hdr.csum_offset = skb->csum_offset;
                } /* else everything is zero */
 
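
Note: skb_checksum_start_offset() replaces the open-coded csum_start arithmetic; as far as I recall it is a thin helper in include/linux/skbuff.h equivalent to the expression it replaces, roughly:

static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
	return skb->csum_start - skb_headroom(skb);
}
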
@@ -2325,37 +2336,70 @@ static const struct vm_operations_struct packet_mmap_ops = {
        .close  =       packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+                       unsigned int len)
 {
        int i;
 
        for (i = 0; i < len; i++) {
-               if (likely(pg_vec[i]))
-                       free_pages((unsigned long) pg_vec[i], order);
+               if (likely(pg_vec[i].buffer)) {
+                       if (is_vmalloc_addr(pg_vec[i].buffer))
+                               vfree(pg_vec[i].buffer);
+                       else
+                               free_pages((unsigned long)pg_vec[i].buffer,
+                                          order);
+                       pg_vec[i].buffer = NULL;
+               }
        }
        kfree(pg_vec);
 }
 
 static inline char *alloc_one_pg_vec_page(unsigned long order)
 {
-       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+       char *buffer = NULL;
+       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+                         __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
 
-       return (char *) __get_free_pages(gfp_flags, order);
+       buffer = (char *) __get_free_pages(gfp_flags, order);
+
+       if (buffer)
+               return buffer;
+
+       /*
+        * __get_free_pages failed, fall back to vmalloc
+        */
+       buffer = vzalloc((1 << order) * PAGE_SIZE);
+
+       if (buffer)
+               return buffer;
+
+       /*
+        * vmalloc failed, lets dig into swap here
+        */
+       gfp_flags &= ~__GFP_NORETRY;
+       buffer = (char *)__get_free_pages(gfp_flags, order);
+       if (buffer)
+               return buffer;
+
+       /*
+        * complete and utter failure
+        */
+       return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
        unsigned int block_nr = req->tp_block_nr;
-       char **pg_vec;
+       struct pgv *pg_vec;
        int i;
 
-       pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+       pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
        if (unlikely(!pg_vec))
                goto out;
 
        for (i = 0; i < block_nr; i++) {
-               pg_vec[i] = alloc_one_pg_vec_page(order);
-               if (unlikely(!pg_vec[i]))
+               pg_vec[i].buffer = alloc_one_pg_vec_page(order);
+               if (unlikely(!pg_vec[i].buffer))
                        goto out_free_pgvec;
        }
 
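
Note: alloc_one_pg_vec_page() now tries the page allocator with __GFP_NORETRY | __GFP_NOWARN first, falls back to vzalloc(), and only then retries the page allocator without __GFP_NORETRY. The practical effect is that large block sizes no longer fail outright on fragmented memory. A rough user-space setup that benefits from this (illustrative values, not from the patch):

#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <stdio.h>

int main(void)
{
	struct tpacket_req req = {
		.tp_block_size = 1 << 22,	/* 4 MiB blocks: order-10 if physically contiguous */
		.tp_block_nr   = 16,
		.tp_frame_size = 2048,
		.tp_frame_nr   = ((1 << 22) / 2048) * 16,
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	void *ring;

	if (fd < 0 ||
	    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
		perror("PACKET_RX_RING");
		return 1;
	}
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("rx ring mapped at %p\n", ring);
	return 0;
}
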
@@ -2371,7 +2415,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring)
 {
-       char **pg_vec = NULL;
+       struct pgv *pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        int was_running, order = 0;
        struct packet_ring_buffer *rb;
@@ -2456,22 +2500,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
        mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_read(&po->mapped) == 0) {
                err = 0;
-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
                spin_lock_bh(&rb_queue->lock);
-               pg_vec = XC(rb->pg_vec, pg_vec);
+               swap(rb->pg_vec, pg_vec);
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
                spin_unlock_bh(&rb_queue->lock);
 
-               order = XC(rb->pg_vec_order, order);
-               req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+               swap(rb->pg_vec_order, order);
+               swap(rb->pg_vec_len, req->tp_block_nr);
 
                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                po->prot_hook.func = (po->rx_ring.pg_vec) ?
                                                tpacket_rcv : packet_rcv;
                skb_queue_purge(rb_queue);
-#undef XC
                if (atomic_read(&po->mapped))
                        pr_err("packet_mmap: vma is busy: %d\n",
                               atomic_read(&po->mapped));
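
Note: the local XC() exchange macro is dropped in favour of the generic swap() from include/linux/kernel.h, which (roughly) expands to a temporary of the right type. A tiny stand-alone illustration of the same macro:

#include <stdio.h>

/* Approximately the kernel's definition: */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	int order = 0, old_order = 3;

	swap(order, old_order);	/* works for any type, no per-call-site macro needed */
	printf("%d %d\n", order, old_order);
	return 0;
}
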
@@ -2533,15 +2575,17 @@ static int packet_mmap(struct file *file, struct socket *sock,
                        continue;
 
                for (i = 0; i < rb->pg_vec_len; i++) {
-                       struct page *page = virt_to_page(rb->pg_vec[i]);
+                       struct page *page;
+                       void *kaddr = rb->pg_vec[i].buffer;
                        int pg_num;
 
-                       for (pg_num = 0; pg_num < rb->pg_vec_pages;
-                                       pg_num++, page++) {
+                       for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+                               page = pgv_to_page(kaddr);
                                err = vm_insert_page(vma, start, page);
                                if (unlikely(err))
                                        goto out;
                                start += PAGE_SIZE;
+                               kaddr += PAGE_SIZE;
                        }
                }
        }