/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and do the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.


   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty: it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
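/*
 * A minimal sketch (illustrative only, not part of this file) of what a
 * "queue"-style discipline supplies; the hypothetical example_enqueue()
 * below reuses the generic helpers from sch_generic.h:
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_reshape_fail(skb, sch);
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= qdisc_dequeue_head,
 *		.peek		= qdisc_peek_head,
 *		.drop		= qdisc_queue_drop,
 *		.owner		= THIS_MODULE,
 *	};
 *
 * A module would then call register_qdisc(&example_qdisc_ops) from its
 * init routine and unregister_qdisc(&example_qdisc_ops) on exit.
 */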
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);
int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
/* We know handle. Find qdisc among all qdiscs attached to the device
   (root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
/*
 * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
 * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
 */
static DEFINE_SPINLOCK(qdisc_list_lock);

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_del(&q->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}
EXPORT_SYMBOL(qdisc_list_del);
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;
	struct Qdisc *q;

	spin_lock_bh(&qdisc_list_lock);

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *txq_root = txq->qdisc_sleeping;

		q = qdisc_match_from_root(txq_root, handle);
		if (q)
			goto unlock;
	}

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);

unlock:
	spin_unlock_bh(&qdisc_list_lock);

	return q;
}
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
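/*
 * Typical use, as a sketch: a shaping qdisc resolves its tc_ratespec and
 * the corresponding rate-table attribute into a shared table during
 * init/change (the attribute name follows tbf here, purely as an example):
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *
 * and drops its reference with qdisc_put_rtab(rtab) on destroy.
 */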
void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
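/*
 * Worked example with illustrative numbers: overhead = 4, cell_align = -1,
 * cell_log = 6, size_log = 6 and a 100 byte skb give pkt_len = 104,
 * slot = (104 - 1) >> 6 = 1, so the packet is charged as
 * stab->data[1] << 6 bytes instead of its wire length.
 */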
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
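/*
 * Sketch of the intended calling pattern, modelled on tbf (names
 * abbreviated, not a complete qdisc):
 *
 *	init:     qdisc_watchdog_init(&q->watchdog, sch);
 *	dequeue:  if the head packet may not be sent yet, call
 *	          qdisc_watchdog_schedule(&q->watchdog, next_send_time)
 *	          and return NULL;
 *	reset/destroy: qdisc_watchdog_cancel(&q->watchdog);
 *
 * The hrtimer clears TCQ_F_THROTTLED and reschedules the root qdisc once
 * next_send_time arrives.
 */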
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
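/*
 * A classful qdisc embeds a struct Qdisc_class_common at the start of its
 * per-class structure and drives this hash roughly as follows (a sketch;
 * example_class is hypothetical):
 *
 *	struct example_class {
 *		struct Qdisc_class_common common;
 *		...
 *	};
 *
 *	init:          qdisc_class_hash_init(&q->clhash);
 *	class create:  cl->common.classid = classid;
 *	               qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	               qdisc_class_hash_grow(sch, &q->clhash);
 *	lookup:        qdisc_class_find(&q->clhash, classid);
 *	class delete:  qdisc_class_hash_remove(&q->clhash, &cl->common);
 *	destroy:       qdisc_class_hash_destroy(&q->clhash);
 */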
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
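/*
 * For reference: a handle packs major:minor as TC_H_MAKE(maj, min), and
 * TC_H_MAJ()/TC_H_MIN() extract the halves. The first automatically
 * allocated handle above is therefore 0x80010000, i.e. 8001:0 in tc
 * notation.
 */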
/* Attach toplevel qdisc to device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops() again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	return 0;
}
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
/*
   Create/change qdisc.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and we have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to say
				 *   that the qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is a sort of hole in the API: we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
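	/*
	 * Example of the completion rules above: with qdisc 1:0 at the
	 * root, parent == 1:2 and handle == 0:5 is completed to handle
	 * 1:5, and parent == 0:2 alone is completed to parent 1:2 once
	 * qid has been taken from the root qdisc below.
	 */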
	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
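/*
 * Sketch of how a classful qdisc typically uses this from its ->enqueue()
 * classification helper (q->filter_list being the qdisc's filter chain):
 *
 *	struct tcf_result res;
 *	int result = tc_classify(skb, q->filter_list, &res);
 *
 * On result >= 0 the class is then looked up from res.classid (or
 * res.class); on a negative result the qdisc falls back to a default
 * class.
 */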
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);