From: Patrick McHardy Date: Wed, 3 Jan 2007 23:38:10 +0000 (+0100) Subject: NET_SCHED: Fix fallout from dev->qdisc RCU change X-Git-Tag: v2.6.16.38-rc1~42 X-Git-Url: https://git.karo-electronics.de/?a=commitdiff_plain;h=83d285a27720a4927ad1ca8e12b035ddcf1b5e38;p=karo-tx-linux.git NET_SCHED: Fix fallout from dev->qdisc RCU change The move of qdisc destruction to a rcu callback broke locking in the entire qdisc layer by invalidating previously valid assumptions about the context in which changes to the qdisc tree occur. The two assumptions were: - since changes only happen in process context, read_lock doesn't need bottem half protection. Now invalid since destruction of inner qdiscs, classifiers, actions and estimators happens in the RCU callback unless they're manually deleted, resulting in dead-locks when read_lock in process context is interrupted by write_lock_bh in bottem half context. - since changes only happen under the RTNL, no additional locking is necessary for data not used during packet processing (f.e. u32_list). Again, since destruction now happens in the RCU callback, this assumption is not valid anymore, causing races while using this data, which can result in corruption or use-after-free. Instead of "fixing" this by disabling bottem halfs everywhere and adding new locks/refcounting, this patch makes these assumptions valid again by moving destruction back to process context. Since only the dev->qdisc pointer is protected by RCU, but ->enqueue and the qdisc tree are still protected by dev->qdisc_lock, destruction of the tree can be performed immediately and only the final free needs to happen in the rcu callback to make sure dev_queue_xmit doesn't access already freed memory. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller Signed-off-by: Adrian Bunk --- diff --git a/net/core/dev.c b/net/core/dev.c index 7405b3ffd338..caa8cbb0e0ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1297,14 +1297,16 @@ int dev_queue_xmit(struct sk_buff *skb) if (q->enqueue) { /* Grab device queue */ spin_lock(&dev->queue_lock); + q = dev->qdisc; + if (q->enqueue) { + rc = q->enqueue(skb, q); + qdisc_run(dev); + spin_unlock(&dev->queue_lock); - rc = q->enqueue(skb, q); - - qdisc_run(dev); - + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; + } spin_unlock(&dev->queue_lock); - rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; - goto out; } /* The device has no queue. Common case for software devices: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b4d89fbb3782..8c237c13bd8d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -402,7 +402,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) return skb->len; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); if (!tcm->tcm_parent) q = dev->qdisc_sleeping; else @@ -459,7 +459,7 @@ errout: if (cl) cops->put(q, cl); out: - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); dev_put(dev); return skb->len; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 31570b9a6e9a..9e315bcbc072 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -196,14 +196,14 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { struct Qdisc *q; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); list_for_each_entry(q, &dev->qdisc_list, list) { if (q->handle == handle) { - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); return q; } } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); return NULL; } @@ -838,7 +838,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) continue; if (idx > s_idx) s_q_idx = 0; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); q_idx = 0; list_for_each_entry(q, &dev->qdisc_list, list) { if (q_idx < s_q_idx) { @@ -847,12 +847,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) } if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); goto done; } q_idx++; } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); } done: @@ -1075,7 +1075,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); list_for_each_entry(q, &dev->qdisc_list, list) { if (t < s_t || !q->ops->cl_ops || (tcm->tcm_parent && @@ -1097,7 +1097,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) break; t++; } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); cb->args[0] = t; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 99ceb91f0150..0326752bd0ec 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -46,11 +46,10 @@ The idea is the following: - enqueue, dequeue are serialized via top level device spinlock dev->queue_lock. - - tree walking is protected by read_lock_bh(qdisc_tree_lock) + - tree walking is protected by read_lock(qdisc_tree_lock) and this lock is used only in process context. - - updates to tree are made under rtnl semaphore or - from softirq context (__qdisc_destroy rcu-callback) - hence this lock needs local bh disabling. + - updates to tree are made only under rtnl semaphore, + hence this lock may be made without local bh disabling. qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! */ @@ -58,14 +57,14 @@ DEFINE_RWLOCK(qdisc_tree_lock); void qdisc_lock_tree(struct net_device *dev) { - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); spin_lock_bh(&dev->queue_lock); } void qdisc_unlock_tree(struct net_device *dev) { spin_unlock_bh(&dev->queue_lock); - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } /* @@ -473,20 +472,6 @@ void qdisc_reset(struct Qdisc *qdisc) static void __qdisc_destroy(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); - struct Qdisc_ops *ops = qdisc->ops; - -#ifdef CONFIG_NET_ESTIMATOR - gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); -#endif - write_lock(&qdisc_tree_lock); - if (ops->reset) - ops->reset(qdisc); - if (ops->destroy) - ops->destroy(qdisc); - write_unlock(&qdisc_tree_lock); - module_put(ops->owner); - - dev_put(qdisc->dev); kfree((char *) qdisc - qdisc->padded); } @@ -494,32 +479,23 @@ static void __qdisc_destroy(struct rcu_head *head) void qdisc_destroy(struct Qdisc *qdisc) { - struct list_head cql = LIST_HEAD_INIT(cql); - struct Qdisc *cq, *q, *n; + struct Qdisc_ops *ops = qdisc->ops; if (qdisc->flags & TCQ_F_BUILTIN || - !atomic_dec_and_test(&qdisc->refcnt)) + !atomic_dec_and_test(&qdisc->refcnt)) return; - if (!list_empty(&qdisc->list)) { - if (qdisc->ops->cl_ops == NULL) - list_del(&qdisc->list); - else - list_move(&qdisc->list, &cql); - } - - /* unlink inner qdiscs from dev->qdisc_list immediately */ - list_for_each_entry(cq, &cql, list) - list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) - if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { - if (q->ops->cl_ops == NULL) - list_del_init(&q->list); - else - list_move_tail(&q->list, &cql); - } - list_for_each_entry_safe(cq, n, &cql, list) - list_del_init(&cq->list); + list_del(&qdisc->list); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); +#endif + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + module_put(ops->owner); + dev_put(qdisc->dev); call_rcu(&qdisc->q_rcu, __qdisc_destroy); } @@ -539,15 +515,15 @@ void dev_activate(struct net_device *dev) printk(KERN_INFO "%s: activation failed\n", dev->name); return; } - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); list_add_tail(&qdisc->list, &dev->qdisc_list); - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } else { qdisc = &noqueue_qdisc; } - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); dev->qdisc_sleeping = qdisc; - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } if (!netif_carrier_ok(dev))