/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in an order and at times determined by the
   algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a form
   more intelligible to the kernel, to perform some sanity checks and
   the part of the work that is common to all qdiscs, and to provide
   rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.

   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty: it only means that the
   discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not a
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing the packet from the queue.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers and counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
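
/* For illustration only: a minimal work-conserving qdisc can be built
 * almost entirely from the generic helpers declared in sch_generic.h.
 * The "example" name below is hypothetical and nothing in this file
 * registers it; see sch_fifo.c for a real qdisc built the same way.
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= qdisc_enqueue_tail,	(0 on success)
 *		.dequeue	= qdisc_dequeue_head,	(skb, or NULL if idle)
 *		.peek		= qdisc_peek_head,	(non-destructive)
 *		.reset		= qdisc_reset_queue,	(purge q->q)
 *		.owner		= THIS_MODULE,
 *	};
 *
 * A module implementing it would call register_qdisc(&example_qdisc_ops)
 * from module_init() and unregister_qdisc() from module_exit().
 */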
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);
int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
/* We know handle. Find qdisc among all qdisc's attached to device
   (root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
/*
 * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
 * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
 */
static DEFINE_SPINLOCK(qdisc_list_lock);

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_del(&q->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}
EXPORT_SYMBOL(qdisc_list_del);
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;
	struct Qdisc *q;

	spin_lock_bh(&qdisc_list_lock);

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *txq_root = txq->qdisc_sleeping;

		q = qdisc_match_from_root(txq_root, handle);
		if (q)
			goto unlock;
	}

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);

unlock:
	spin_unlock_bh(&qdisc_list_lock);

	return q;
}
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
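
/* Typical usage (sketch; attribute names as in sch_tbf.c): a shaping
 * qdisc resolves the rate table supplied by userspace once at
 * init/change time and releases it again on destroy:
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
 *	if (rtab == NULL)
 *		goto done;
 *	...
 *	qdisc_put_rtab(rtab);
 *
 * Identical tc_ratespec contents share a single refcounted table, so
 * repeated "tc qdisc change" invocations do not grow qdisc_rtab_list.
 */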
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
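
/* Worked example: with cell_log = 3, size_log = 0, cell_align = 0,
 * overhead = 24 and tsize = 512, a 100 byte skb yields
 * pkt_len = 100 + 24 = 124 and slot = 124 >> 3 = 15, so the packet is
 * charged stab->data[15] bytes. An oversized packet (slot >= tsize) is
 * charged data[tsize - 1] once per whole table span plus the entry for
 * the remainder slot, keeping the extrapolation linear. The parameter
 * values above are illustrative, not defaults.
 */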
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
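
/* Usage pattern (sketch, after sch_tbf.c): a shaping qdisc arms the
 * watchdog from ->dequeue() when the head packet may not be sent yet;
 * the hrtimer then clears TCQ_F_THROTTLED and reschedules the device:
 *
 *	qdisc_watchdog_init(&q->watchdog, sch);		(in ->init)
 *	qdisc_watchdog_schedule(&q->watchdog,
 *				now + delay);		(in ->dequeue)
 *	qdisc_watchdog_cancel(&q->watchdog);		(in ->reset)
 */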
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
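
/* Usage (sketch; this is the pattern used by classful qdiscs such as
 * sch_htb.c): call qdisc_class_hash_init() from ->init, insert each new
 * class with qdisc_class_hash_insert() followed by
 * qdisc_class_hash_grow() - which only rehashes, under sch_tree_lock(),
 * once the load factor demands it - and look classes up with
 * qdisc_class_find() from sch_generic.h.
 */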
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
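
/* Since autohandle starts at 0x80000000 and advances in steps of
 * 0x10000, the first kernel-chosen handle is 0x80010000 (printed by tc
 * as "8001:"), then "8002:" and so on; the counter skips TC_H_ROOT's
 * major (0xffff0000) and wraps back to the start of the range.
 */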
/* Attach toplevel qdisc to device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
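
/* A qdisc that drops packets outside the normal enqueue/dequeue path
 * (for example while shrinking a queue limit in ->change) is expected
 * to call qdisc_tree_decrease_qlen(sch, dropped) so the qlen counters
 * of all ancestors stay in sync with what is actually queued below.
 */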
static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	return 0;
}
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor meant
				 *   that a qdisc with handle tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API: we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave KIND and it does not match the
				 *   existing qdisc. (See the flag-mapping note
				 *   after this function.)
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
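
/* How userspace reaches the branches above (sketch; flag usage as in
 * the iproute2 "tc" utility):
 *
 *	tc qdisc add ...	-> NLM_F_CREATE | NLM_F_EXCL
 *	tc qdisc replace ...	-> NLM_F_CREATE | NLM_F_REPLACE
 *	tc qdisc change ...	-> no flags
 *
 * "add" creates when nothing is attached yet and otherwise fails with
 * -EEXIST because of NLM_F_EXCL; "replace" satisfies the CREATE+REPLACE
 * test and grafts a new qdisc; "change" falls through to qdisc_change()
 * on the existing one.
 */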
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;

done:
	ret = -1;
	goto out;
}
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
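
	/* Example: "tc class add dev eth0 parent 1:1 classid 1:10 ..."
	   arrives here with tcm_parent = 0x00010001 (1:1) and
	   tcm_handle = 0x00010010 (1:10), so qid resolves to major 1:0
	   and both majors agree; omitting "classid" leaves the handle
	   as 0:0 and lets the qdisc pick one.
	 */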
	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
{
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
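
/* A typical read on a box with 1 ns hrtimer resolution (values are
 * illustrative; the last word depends on the configured clock source):
 *
 *	$ cat /proc/net/psched
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. 1000 ns per microsecond, 64 ns per psched tick (the
 * PSCHED_US2NS shift), a nominal 1 MHz reference and a 1 GHz timer.
 */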
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);