};
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
+/*
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
+ * tail pointer for that CPU's input queue at the time of last enqueue.
+ */
+struct rps_dev_flow {
+ u16 cpu;
+ u16 fill;
+ unsigned int last_qtail;
+};
+
+/*
+ * The rps_dev_flow_table structure contains a table of flow mappings.
+ */
+struct rps_dev_flow_table {
+ unsigned int mask;
+ struct rcu_head rcu;
+ struct work_struct free_work;
+ struct rps_dev_flow flows[0];
+};
+#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
+ (_num * sizeof(struct rps_dev_flow)))
+
+/*
+ * The rps_sock_flow_table contains mappings of flows to the last CPU
+ * on which they were processed by the application (set in recvmsg).
+ */
+struct rps_sock_flow_table {
+ unsigned int mask;
+ u16 ents[0];
+};
+#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
+ (_num * sizeof(u16)))
+
+#define RPS_NO_CPU 0xffff
+
+static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
+ u32 hash)
+{
+ if (table && hash) {
+ unsigned int cpu, index = hash & table->mask;
+
+ /* We only give a hint, preemption can change cpu under us */
+ cpu = raw_smp_processor_id();
+
+ if (table->ents[index] != cpu)
+ table->ents[index] = cpu;
+ }
+}
+
+static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
+ u32 hash)
+{
+ if (table && hash)
+ table->ents[hash & table->mask] = RPS_NO_CPU;
+}
+
+extern struct rps_sock_flow_table *rps_sock_flow_table;
+
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
struct rps_map *rps_map;
+ struct rps_dev_flow_table *rps_flow_table;
struct kobject kobj;
struct netdev_rx_queue *first;
atomic_t count;
} ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_RPS */
/*
* This structure defines the management hooks for network devices.
/* Elements below can be accessed between CPUs for RPS */
#ifdef CONFIG_RPS
struct call_single_data csd ____cacheline_aligned_in_smp;
+ unsigned int input_queue_head;
#endif
struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
+static inline void incr_input_queue_head(struct softnet_data *queue)
+{
+#ifdef CONFIG_RPS
+ queue->input_queue_head++;
+#endif
+}
+
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
#define HAVE_NETIF_QUEUE
#include <linux/string.h>
#include <linux/types.h>
#include <linux/jhash.h>
+#include <linux/netdevice.h>
#include <net/flow.h>
#include <net/sock.h>
* @uc_ttl - Unicast TTL
* @inet_sport - Source port
* @inet_id - ID counter for DF pkts
+ * @rxhash - flow hash received from netif layer
* @tos - TOS
* @mc_ttl - Multicasting TTL
* @is_icsk - is this an inet_connection_sock?
__u16 cmsg_flags;
__be16 inet_sport;
__u16 inet_id;
+#ifdef CONFIG_RPS
+ __u32 rxhash;
+#endif
struct ip_options *opt;
__u8 tos;
return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
}
+static inline void inet_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+ struct rps_sock_flow_table *sock_flow_table;
+
+ rcu_read_lock();
+ sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ rps_record_sock_flow(sock_flow_table, inet_sk(sk)->rxhash);
+ rcu_read_unlock();
+#endif
+}
+
+static inline void inet_rps_reset_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+ struct rps_sock_flow_table *sock_flow_table;
+
+ rcu_read_lock();
+ sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ rps_reset_sock_flow(sock_flow_table, inet_sk(sk)->rxhash);
+ rcu_read_unlock();
+#endif
+}
+
+static inline void inet_rps_save_rxhash(const struct sock *sk, u32 rxhash)
+{
+#ifdef CONFIG_RPS
+ if (unlikely(inet_sk(sk)->rxhash != rxhash)) {
+ inet_rps_reset_flow(sk);
+ inet_sk(sk)->rxhash = rxhash;
+ }
+#endif
+}
#endif /* _INET_SOCK_H */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
* rcu_read_lock must be held on entry.
*/
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
{
struct ipv6hdr *ip6;
struct iphdr *ip;
struct netdev_rx_queue *rxqueue;
struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_sock_flow_table *sock_flow_table;
int cpu = -1;
u8 ip_proto;
+ u16 tcpu;
u32 addr1, addr2, ports, ihl;
if (skb_rx_queue_recorded(skb)) {
} else
rxqueue = dev->_rx;
- if (!rxqueue->rps_map)
+ if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
goto done;
if (skb->rxhash)
skb->rxhash = 1;
got_hash:
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ if (flow_table && sock_flow_table) {
+ u16 next_cpu;
+ struct rps_dev_flow *rflow;
+
+ rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+ tcpu = rflow->cpu;
+
+ next_cpu = sock_flow_table->ents[skb->rxhash &
+ sock_flow_table->mask];
+
+ /*
+ * If the desired CPU (where last recvmsg was done) is
+ * different from current CPU (one in the rx-queue flow
+ * table entry), switch if one of the following holds:
+ * - Current CPU is unset (equal to RPS_NO_CPU).
+ * - Current CPU is offline.
+ * - The current CPU's queue tail has advanced beyond the
+ * last packet that was enqueued using this table entry.
+ * This guarantees that all previous packets for the flow
+ * have been dequeued, thus preserving in order delivery.
+ */
+ if (unlikely(tcpu != next_cpu) &&
+ (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+ ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ rflow->last_qtail)) >= 0)) {
+ tcpu = rflow->cpu = next_cpu;
+ if (tcpu != RPS_NO_CPU)
+ rflow->last_qtail = per_cpu(softnet_data,
+ tcpu).input_queue_head;
+ }
+ if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+ *rflowp = rflow;
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
map = rcu_dereference(rxqueue->rps_map);
if (map) {
- u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+ tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
if (cpu_online(tcpu)) {
cpu = tcpu;
__napi_schedule(&queue->backlog);
__get_cpu_var(netdev_rx_stat).received_rps++;
}
-#endif /* CONFIG_SMP */
+#endif /* CONFIG_RPS */
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
-static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+ unsigned int *qtail)
{
struct softnet_data *queue;
unsigned long flags;
if (queue->input_pkt_queue.qlen) {
enqueue:
__skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_RPS
+ *qtail = queue->input_queue_head +
+ queue->input_pkt_queue.qlen;
+#endif
rps_unlock(queue);
local_irq_restore(flags);
return NET_RX_SUCCESS;
cpu_set(cpu, rcpus->mask[rcpus->select]);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
- } else
- __napi_schedule(&queue->backlog);
-#else
- __napi_schedule(&queue->backlog);
+ goto enqueue;
+ }
#endif
+ __napi_schedule(&queue->backlog);
}
goto enqueue;
}
#ifdef CONFIG_RPS
{
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
rcu_read_lock();
- cpu = get_rps_cpu(skb->dev, skb);
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu < 0)
cpu = smp_processor_id();
- ret = enqueue_to_backlog(skb, cpu);
+
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
rcu_read_unlock();
}
#else
- ret = enqueue_to_backlog(skb, get_cpu());
- put_cpu();
+ {
+ unsigned int qtail;
+ ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+ put_cpu();
+ }
#endif
return ret;
}
int netif_receive_skb(struct sk_buff *skb)
{
#ifdef CONFIG_RPS
- int cpu;
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu, ret;
+
+ rcu_read_lock();
- cpu = get_rps_cpu(skb->dev, skb);
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
- if (cpu < 0)
- return __netif_receive_skb(skb);
- else
- return enqueue_to_backlog(skb, cpu);
+ if (cpu >= 0) {
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ ret = __netif_receive_skb(skb);
+ }
+
+ return ret;
#else
return __netif_receive_skb(skb);
#endif
if (skb->dev == dev) {
__skb_unlink(skb, &queue->input_pkt_queue);
kfree_skb(skb);
+ incr_input_queue_head(queue);
}
rps_unlock(queue);
}
local_irq_enable();
break;
}
+ incr_input_queue_head(queue);
rps_unlock(queue);
local_irq_enable();
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
- while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
+ incr_input_queue_head(oldsd);
+ }
return NOTIFY_OK;
}
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/wireless.h>
+#include <linux/vmalloc.h>
#include <net/wext.h>
#include "net-sysfs.h"
return len;
}
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ char *buf)
+{
+ struct rps_dev_flow_table *flow_table;
+ unsigned int val = 0;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(queue->rps_flow_table);
+ if (flow_table)
+ val = flow_table->mask + 1;
+ rcu_read_unlock();
+
+ return sprintf(buf, "%u\n", val);
+}
+
+static void rps_dev_flow_table_release_work(struct work_struct *work)
+{
+ struct rps_dev_flow_table *table = container_of(work,
+ struct rps_dev_flow_table, free_work);
+
+ vfree(table);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+ struct rps_dev_flow_table *table = container_of(rcu,
+ struct rps_dev_flow_table, rcu);
+
+ INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
+ schedule_work(&table->free_work);
+}
+
+ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int count;
+ char *endp;
+ struct rps_dev_flow_table *table, *old_table;
+ static DEFINE_SPINLOCK(rps_dev_flow_lock);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ count = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+ if (count) {
+ int i;
+
+ if (count > 1<<30) {
+ /* Enforce a limit to prevent overflow */
+ return -EINVAL;
+ }
+ count = roundup_pow_of_two(count);
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+ if (!table)
+ return -ENOMEM;
+
+ table->mask = count - 1;
+ for (i = 0; i < count; i++)
+ table->flows[i].cpu = RPS_NO_CPU;
+ } else
+ table = NULL;
+
+ spin_lock(&rps_dev_flow_lock);
+ old_table = queue->rps_flow_table;
+ rcu_assign_pointer(queue->rps_flow_table, table);
+ spin_unlock(&rps_dev_flow_lock);
+
+ if (old_table)
+ call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+ return len;
+}
+
static struct rx_queue_attribute rps_cpus_attribute =
__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+ __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+
static struct attribute *rx_queue_default_attrs[] = {
&rps_cpus_attribute.attr,
+ &rps_dev_flow_table_cnt_attribute.attr,
NULL
};
static void rx_queue_release(struct kobject *kobj)
{
struct netdev_rx_queue *queue = to_rx_queue(kobj);
- struct rps_map *map = queue->rps_map;
struct netdev_rx_queue *first = queue->first;
- if (map)
- call_rcu(&map->rcu, rps_map_release);
+ if (queue->rps_map)
+ call_rcu(&queue->rps_map->rcu, rps_map_release);
+
+ if (queue->rps_flow_table)
+ call_rcu(&queue->rps_flow_table->rcu,
+ rps_dev_flow_table_release);
if (atomic_dec_and_test(&first->count))
kfree(first);
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/sock.h>
+#ifdef CONFIG_RPS
+static int rps_sock_flow_sysctl(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned int orig_size, size;
+ int ret, i;
+ ctl_table tmp = {
+ .data = &size,
+ .maxlen = sizeof(size),
+ .mode = table->mode
+ };
+ struct rps_sock_flow_table *orig_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+
+ mutex_lock(&sock_flow_mutex);
+
+ orig_sock_table = rps_sock_flow_table;
+ size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+ if (write) {
+ if (size) {
+ if (size > 1<<30) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table =
+ vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!sock_table) {
+ mutex_unlock(&sock_flow_mutex);
+ return -ENOMEM;
+ }
+
+ sock_table->mask = size - 1;
+ } else
+ sock_table = orig_sock_table;
+
+ for (i = 0; i < size; i++)
+ sock_table->ents[i] = RPS_NO_CPU;
+ } else
+ sock_table = NULL;
+
+ if (sock_table != orig_sock_table) {
+ rcu_assign_pointer(rps_sock_flow_table, sock_table);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
+ }
+
+ mutex_unlock(&sock_flow_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_RPS */
+
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
.mode = 0644,
.proc_handler = proc_dointvec
},
+#ifdef CONFIG_RPS
+ {
+ .procname = "rps_sock_flow_entries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rps_sock_flow_sysctl
+ },
+#endif
#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
if (sk) {
long timeout;
+ inet_rps_reset_flow(sk);
+
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
{
struct sock *sk = sock->sk;
+ inet_rps_record_flow(sk);
+
/* We may need to bind the socket. */
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
}
EXPORT_SYMBOL(inet_sendmsg);
-
static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags)
{
struct sock *sk = sock->sk;
+ inet_rps_record_flow(sk);
+
/* We may need to bind the socket. */
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
return sock_no_sendpage(sock, page, offset, size, flags);
}
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ inet_rps_record_flow(sk);
+
+ err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
int inet_shutdown(struct socket *sock, int how)
{
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = tcp_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
.splice_read = tcp_splice_read,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
skb->dev = NULL;
+ inet_rps_save_rxhash(sk, skb->rxhash);
+
bh_lock_sock_nested(sk);
ret = 0;
if (!sock_owned_by_user(sk)) {
sk->sk_state = TCP_CLOSE;
inet->inet_daddr = 0;
inet->inet_dport = 0;
+ inet_rps_save_rxhash(sk, 0);
sk->sk_bound_dev_if = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- int rc = sock_queue_rcv_skb(sk, skb);
+ int rc;
+
+ if (inet_sk(sk)->inet_daddr)
+ inet_rps_save_rxhash(sk, skb->rxhash);
+ rc = sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);