Merge tag 'v2.6.38' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8298e676f5a015f58d1b6005cf85938f8c8e142a..91cb1d71f018d8b6b77c029dca1b99cb2855ef21 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -61,6 +61,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -163,8 +164,13 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+       char *buffer;
+};
+
 struct packet_ring_buffer {
-       char                    **pg_vec;
+       struct pgv              *pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
@@ -217,6 +223,13 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)   ((struct packet_skb_cb *)((__skb)->cb))
 
+static inline __pure struct page *pgv_to_page(void *addr)
+{
+       if (is_vmalloc_addr(addr))
+               return vmalloc_to_page(addr);
+       return virt_to_page(addr);
+}
+
 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
        union {
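
Note: ring blocks can now come from vmalloc(), whose pages are not in the kernel's linear mapping, so virt_to_page() would return a bogus struct page for them. pgv_to_page() picks the right translation per address. A minimal, hypothetical module sketch (not part of this patch) showing the same dispatch on a vzalloc()ed buffer:

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

static struct page *demo_addr_to_page(void *addr)
{
	/* Same test pgv_to_page() uses: vmalloc addresses need vmalloc_to_page(). */
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static int __init demo_init(void)
{
	char *buf = vzalloc(4 * PAGE_SIZE);
	char *p;

	if (!buf)
		return -ENOMEM;
	for (p = buf; p < buf + 4 * PAGE_SIZE; p += PAGE_SIZE)
		pr_info("vaddr %p -> pfn %lu\n", p,
			page_to_pfn(demo_addr_to_page(p)));
	vfree(buf);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
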
@@ -229,11 +242,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
-               flush_dcache_page(virt_to_page(&h.h1->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
-               flush_dcache_page(virt_to_page(&h.h2->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
@@ -256,10 +269,10 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
-               flush_dcache_page(virt_to_page(&h.h1->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
-               flush_dcache_page(virt_to_page(&h.h2->tp_status));
+               flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
@@ -283,7 +296,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;
 
-       h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+       h.raw = rb->pg_vec[pg_vec_pos].buffer +
+               (frame_offset * rb->frame_size);
 
        if (status != __packet_get_status(po, h.raw))
                return NULL;
@@ -503,7 +517,8 @@ out_free:
        return err;
 }
 
-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+static inline unsigned int run_filter(const struct sk_buff *skb,
+                                     const struct sock *sk,
                                      unsigned int res)
 {
        struct sk_filter *filter;
@@ -511,22 +526,22 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
        rcu_read_lock_bh();
        filter = rcu_dereference_bh(sk->sk_filter);
        if (filter != NULL)
-               res = sk_run_filter(skb, filter->insns, filter->len);
+               res = sk_run_filter(skb, filter->insns);
        rcu_read_unlock_bh();
 
        return res;
 }
 
 /*
-   This function makes lazy skb cloning in hope that most of packets
-   are discarded by BPF.
-
-   Note tricky part: we DO mangle shared skb! skb->data, skb->len
-   and skb->cb are mangled. It works because (and until) packets
-   falling here are owned by current CPU. Output packets are cloned
-   by dev_queue_xmit_nit(), input packets are processed by net_bh
-   sequencially, so that if we return skb to original state on exit,
-   we will not harm anyone.
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequencially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
  */
 
 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
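
Note: sk_run_filter() loses its length argument here, following the filter-engine rework pulled in with this merge, and run_filter() now takes const pointers since it only reads the skb and socket. For context (not part of the patch), a classic BPF program reaches the sk->sk_filter that run_filter() dereferences via SO_ATTACH_FILTER, roughly like this accept-everything example:

#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/filter.h>
#include <stdio.h>

int main(void)
{
	/* BPF_RET | BPF_K: accept up to 0xffff bytes of every packet.
	 * Real programs usually come from "tcpdump -dd <expr>".
	 * Needs CAP_NET_RAW for the AF_PACKET socket. */
	struct sock_filter code[] = {
		{ 0x06, 0, 0, 0x0000ffff },
	};
	struct sock_fprog prog = { .len = 1, .filter = code };
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
				 &prog, sizeof(prog)) < 0) {
		perror("SO_ATTACH_FILTER");
		return 1;
	}
	printf("filter attached\n");
	return 0;
}
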
@@ -552,11 +567,11 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
-                  exported to higher levels.
-
-                  Otherwise, the device hides datails of it frame
-                  structure, so that corresponding packet head
-                  never delivered to user.
+                * exported to higher levels.
+                *
+                * Otherwise, the device hides details of its frame
+                * structure, so that corresponding packet head is
+                * never delivered to user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
@@ -791,17 +806,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 
        __packet_set_status(po, h.raw, status);
        smp_mb();
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        {
-               struct page *p_start, *p_end;
-               u8 *h_end = h.raw + macoff + snaplen - 1;
-
-               p_start = virt_to_page(h.raw);
-               p_end = virt_to_page(h_end);
-               while (p_start <= p_end) {
-                       flush_dcache_page(p_start);
-                       p_start++;
-               }
+               u8 *start, *end;
+
+               end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
+               for (start = h.raw; start < end; start += PAGE_SIZE)
+                       flush_dcache_page(pgv_to_page(start));
        }
+#endif
 
        sk->sk_data_ready(sk, 0);
 
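
Note: the dcache flush now walks the frame page by page through pgv_to_page(), so vmalloc-backed frames are covered too, and the whole block compiles out on architectures that do not implement flush_dcache_page(). The consumer on the other side of this handshake polls tp_status in the mapped ring; a rough, illustrative TPACKET_V1 loop (not from the patch, assuming frames packed back to back, i.e. tp_block_size a multiple of tp_frame_size):

#include <linux/if_packet.h>
#include <poll.h>
#include <stddef.h>

static void consume_ring(int fd, char *ring, unsigned int frame_nr,
			 unsigned int frame_size)
{
	unsigned int i = 0;

	for (;;) {
		struct tpacket_hdr *hdr =
			(struct tpacket_hdr *)(ring + (size_t)i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}
		/* Payload starts hdr->tp_mac bytes into the frame;
		 * hdr->tp_snaplen bytes were captured. */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the frame back */
		i = (i + 1) % frame_nr;
	}
}
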
@@ -907,7 +920,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
        }
 
        err = -EFAULT;
-       page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);
@@ -926,11 +938,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                        return -EFAULT;
                }
 
+               page = pgv_to_page(data);
+               data += len;
                flush_dcache_page(page);
                get_page(page);
-               skb_fill_page_desc(skb,
-                               nr_frags,
-                               page++, offset, len);
+               skb_fill_page_desc(skb, nr_frags, page, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
@@ -1638,8 +1650,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-                       vnet_hdr.csum_start = skb->csum_start -
-                                                       skb_headroom(skb);
+                       vnet_hdr.csum_start = skb_checksum_start_offset(skb);
                        vnet_hdr.csum_offset = skb->csum_offset;
                } /* else everything is zero */
 
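
Note: skb_checksum_start_offset() replaces the open-coded csum_start arithmetic; as far as I recall it is a thin helper in include/linux/skbuff.h equivalent to the expression it replaces, roughly:

static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
	return skb->csum_start - skb_headroom(skb);
}
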
@@ -2325,37 +2336,70 @@ static const struct vm_operations_struct packet_mmap_ops = {
        .close  =       packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+                       unsigned int len)
 {
        int i;
 
        for (i = 0; i < len; i++) {
-               if (likely(pg_vec[i]))
-                       free_pages((unsigned long) pg_vec[i], order);
+               if (likely(pg_vec[i].buffer)) {
+                       if (is_vmalloc_addr(pg_vec[i].buffer))
+                               vfree(pg_vec[i].buffer);
+                       else
+                               free_pages((unsigned long)pg_vec[i].buffer,
+                                          order);
+                       pg_vec[i].buffer = NULL;
+               }
        }
        kfree(pg_vec);
 }
 
 static inline char *alloc_one_pg_vec_page(unsigned long order)
 {
-       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+       char *buffer = NULL;
+       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+                         __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
 
-       return (char *) __get_free_pages(gfp_flags, order);
+       buffer = (char *) __get_free_pages(gfp_flags, order);
+
+       if (buffer)
+               return buffer;
+
+       /*
+        * __get_free_pages failed, fall back to vmalloc
+        */
+       buffer = vzalloc((1 << order) * PAGE_SIZE);
+
+       if (buffer)
+               return buffer;
+
+       /*
+        * vmalloc failed, lets dig into swap here
+        */
+       gfp_flags &= ~__GFP_NORETRY;
+       buffer = (char *)__get_free_pages(gfp_flags, order);
+       if (buffer)
+               return buffer;
+
+       /*
+        * complete and utter failure
+        */
+       return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
        unsigned int block_nr = req->tp_block_nr;
-       char **pg_vec;
+       struct pgv *pg_vec;
        int i;
 
-       pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+       pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
        if (unlikely(!pg_vec))
                goto out;
 
        for (i = 0; i < block_nr; i++) {
-               pg_vec[i] = alloc_one_pg_vec_page(order);
-               if (unlikely(!pg_vec[i]))
+               pg_vec[i].buffer = alloc_one_pg_vec_page(order);
+               if (unlikely(!pg_vec[i].buffer))
                        goto out_free_pgvec;
        }
 
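
Note: alloc_one_pg_vec_page() now tries the page allocator with __GFP_NORETRY | __GFP_NOWARN first, falls back to vzalloc(), and only then retries the page allocator without __GFP_NORETRY. The practical effect is that large block sizes no longer fail outright on fragmented memory. A rough user-space setup that benefits from this (illustrative values, not from the patch):

#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <stdio.h>

int main(void)
{
	struct tpacket_req req = {
		.tp_block_size = 1 << 22,	/* 4 MiB blocks: order-10 if physically contiguous */
		.tp_block_nr   = 16,
		.tp_frame_size = 2048,
		.tp_frame_nr   = ((1 << 22) / 2048) * 16,
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	void *ring;

	if (fd < 0 ||
	    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
		perror("PACKET_RX_RING");
		return 1;
	}
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("rx ring mapped at %p\n", ring);
	return 0;
}
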
@@ -2371,7 +2415,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring)
 {
-       char **pg_vec = NULL;
+       struct pgv *pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        int was_running, order = 0;
        struct packet_ring_buffer *rb;
@@ -2456,22 +2500,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
        mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_read(&po->mapped) == 0) {
                err = 0;
-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
                spin_lock_bh(&rb_queue->lock);
-               pg_vec = XC(rb->pg_vec, pg_vec);
+               swap(rb->pg_vec, pg_vec);
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
                spin_unlock_bh(&rb_queue->lock);
 
-               order = XC(rb->pg_vec_order, order);
-               req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+               swap(rb->pg_vec_order, order);
+               swap(rb->pg_vec_len, req->tp_block_nr);
 
                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                po->prot_hook.func = (po->rx_ring.pg_vec) ?
                                                tpacket_rcv : packet_rcv;
                skb_queue_purge(rb_queue);
-#undef XC
                if (atomic_read(&po->mapped))
                        pr_err("packet_mmap: vma is busy: %d\n",
                               atomic_read(&po->mapped));
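
Note: the local XC() exchange macro is dropped in favour of the generic swap() from include/linux/kernel.h, which (roughly) expands to a temporary of the right type. A tiny stand-alone illustration of the same macro:

#include <stdio.h>

/* Approximately the kernel's definition: */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	int order = 0, old_order = 3;

	swap(order, old_order);	/* works for any type, no per-call-site macro needed */
	printf("%d %d\n", order, old_order);
	return 0;
}
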
@@ -2533,15 +2575,17 @@ static int packet_mmap(struct file *file, struct socket *sock,
                        continue;
 
                for (i = 0; i < rb->pg_vec_len; i++) {
-                       struct page *page = virt_to_page(rb->pg_vec[i]);
+                       struct page *page;
+                       void *kaddr = rb->pg_vec[i].buffer;
                        int pg_num;
 
-                       for (pg_num = 0; pg_num < rb->pg_vec_pages;
-                                       pg_num++, page++) {
+                       for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+                               page = pgv_to_page(kaddr);
                                err = vm_insert_page(vma, start, page);
                                if (unlikely(err))
                                        goto out;
                                start += PAGE_SIZE;
+                               kaddr += PAGE_SIZE;
                        }
                }
        }