]> git.karo-electronics.de Git - karo-tx-linux.git/blob - net/sched/sch_api.c
36ac0ec81ce0f78dc9cee52125ca3e683fef85bf
[karo-tx-linux.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all packets into "traffic classes"
63      using "packet classifiers" (see cls_api.c).
64
65    In turn, classes may have child qdiscs (as a rule, queues)
66    attached to them, and so on.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform some
71    sanity checks and the part of the work which is common to all
72    qdiscs, and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/uregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list;
325              (rtab = *rtabp) != NULL;
326              rtabp = &rtab->next) {
327                 if (rtab == tab) {
328                         *rtabp = rtab->next;
329                         kfree(rtab);
330                         return;
331                 }
332         }
333 }
334 EXPORT_SYMBOL(qdisc_put_rtab);
335
336 static LIST_HEAD(qdisc_stab_list);
337 static DEFINE_SPINLOCK(qdisc_stab_lock);
338
339 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
340         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
341         [TCA_STAB_DATA] = { .type = NLA_BINARY },
342 };
343
344 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
345 {
346         struct nlattr *tb[TCA_STAB_MAX + 1];
347         struct qdisc_size_table *stab;
348         struct tc_sizespec *s;
349         unsigned int tsize = 0;
350         u16 *tab = NULL;
351         int err;
352
353         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
354         if (err < 0)
355                 return ERR_PTR(err);
356         if (!tb[TCA_STAB_BASE])
357                 return ERR_PTR(-EINVAL);
358
359         s = nla_data(tb[TCA_STAB_BASE]);
360
361         if (s->tsize > 0) {
362                 if (!tb[TCA_STAB_DATA])
363                         return ERR_PTR(-EINVAL);
364                 tab = nla_data(tb[TCA_STAB_DATA]);
365                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
366         }
367
368         if (tsize != s->tsize || (!tab && tsize > 0))
369                 return ERR_PTR(-EINVAL);
370
371         spin_lock(&qdisc_stab_lock);
372
373         list_for_each_entry(stab, &qdisc_stab_list, list) {
374                 if (memcmp(&stab->szopts, s, sizeof(*s)))
375                         continue;
376                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
377                         continue;
378                 stab->refcnt++;
379                 spin_unlock(&qdisc_stab_lock);
380                 return stab;
381         }
382
383         spin_unlock(&qdisc_stab_lock);
384
385         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
386         if (!stab)
387                 return ERR_PTR(-ENOMEM);
388
389         stab->refcnt = 1;
390         stab->szopts = *s;
391         if (tsize > 0)
392                 memcpy(stab->data, tab, tsize * sizeof(u16));
393
394         spin_lock(&qdisc_stab_lock);
395         list_add_tail(&stab->list, &qdisc_stab_list);
396         spin_unlock(&qdisc_stab_lock);
397
398         return stab;
399 }
400
401 void qdisc_put_stab(struct qdisc_size_table *tab)
402 {
403         if (!tab)
404                 return;
405
406         spin_lock(&qdisc_stab_lock);
407
408         if (--tab->refcnt == 0) {
409                 list_del(&tab->list);
410                 kfree(tab);
411         }
412
413         spin_unlock(&qdisc_stab_lock);
414 }
415 EXPORT_SYMBOL(qdisc_put_stab);
416
417 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
418 {
419         struct nlattr *nest;
420
421         nest = nla_nest_start(skb, TCA_STAB);
422         if (nest == NULL)
423                 goto nla_put_failure;
424         NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
425         nla_nest_end(skb, nest);
426
427         return skb->len;
428
429 nla_put_failure:
430         return -1;
431 }
432
433 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
434 {
435         int pkt_len, slot;
436
437         pkt_len = skb->len + stab->szopts.overhead;
438         if (unlikely(!stab->szopts.tsize))
439                 goto out;
440
441         slot = pkt_len + stab->szopts.cell_align;
442         if (unlikely(slot < 0))
443                 slot = 0;
444
445         slot >>= stab->szopts.cell_log;
446         if (likely(slot < stab->szopts.tsize))
447                 pkt_len = stab->data[slot];
448         else
449                 pkt_len = stab->data[stab->szopts.tsize - 1] *
450                                 (slot / stab->szopts.tsize) +
451                                 stab->data[slot % stab->szopts.tsize];
452
453         pkt_len <<= stab->szopts.size_log;
454 out:
455         if (unlikely(pkt_len < 1))
456                 pkt_len = 1;
457         qdisc_skb_cb(skb)->pkt_len = pkt_len;
458 }
459 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
460
461 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
462 {
463         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
464                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
465                         txt, qdisc->ops->id, qdisc->handle >> 16);
466                 qdisc->flags |= TCQ_F_WARN_NONWC;
467         }
468 }
469 EXPORT_SYMBOL(qdisc_warn_nonwc);
470
471 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
472 {
473         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
474                                                  timer);
475
476         wd->qdisc->flags &= ~TCQ_F_THROTTLED;
477         __netif_schedule(qdisc_root(wd->qdisc));
478
479         return HRTIMER_NORESTART;
480 }
481
482 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
483 {
484         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
485         wd->timer.function = qdisc_watchdog;
486         wd->qdisc = qdisc;
487 }
488 EXPORT_SYMBOL(qdisc_watchdog_init);
489
490 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
491 {
492         ktime_t time;
493
494         if (test_bit(__QDISC_STATE_DEACTIVATED,
495                      &qdisc_root_sleeping(wd->qdisc)->state))
496                 return;
497
498         wd->qdisc->flags |= TCQ_F_THROTTLED;
499         time = ktime_set(0, 0);
500         time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
501         hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
502 }
503 EXPORT_SYMBOL(qdisc_watchdog_schedule);
504
505 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
506 {
507         hrtimer_cancel(&wd->timer);
508         wd->qdisc->flags &= ~TCQ_F_THROTTLED;
509 }
510 EXPORT_SYMBOL(qdisc_watchdog_cancel);
511
512 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
513 {
514         unsigned int size = n * sizeof(struct hlist_head), i;
515         struct hlist_head *h;
516
517         if (size <= PAGE_SIZE)
518                 h = kmalloc(size, GFP_KERNEL);
519         else
520                 h = (struct hlist_head *)
521                         __get_free_pages(GFP_KERNEL, get_order(size));
522
523         if (h != NULL) {
524                 for (i = 0; i < n; i++)
525                         INIT_HLIST_HEAD(&h[i]);
526         }
527         return h;
528 }
529
530 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
531 {
532         unsigned int size = n * sizeof(struct hlist_head);
533
534         if (size <= PAGE_SIZE)
535                 kfree(h);
536         else
537                 free_pages((unsigned long)h, get_order(size));
538 }
539
540 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
541 {
542         struct Qdisc_class_common *cl;
543         struct hlist_node *n, *next;
544         struct hlist_head *nhash, *ohash;
545         unsigned int nsize, nmask, osize;
546         unsigned int i, h;
547
548         /* Rehash when load factor exceeds 0.75 */
549         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
550                 return;
551         nsize = clhash->hashsize * 2;
552         nmask = nsize - 1;
553         nhash = qdisc_class_hash_alloc(nsize);
554         if (nhash == NULL)
555                 return;
556
557         ohash = clhash->hash;
558         osize = clhash->hashsize;
559
560         sch_tree_lock(sch);
561         for (i = 0; i < osize; i++) {
562                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
563                         h = qdisc_class_hash(cl->classid, nmask);
564                         hlist_add_head(&cl->hnode, &nhash[h]);
565                 }
566         }
567         clhash->hash     = nhash;
568         clhash->hashsize = nsize;
569         clhash->hashmask = nmask;
570         sch_tree_unlock(sch);
571
572         qdisc_class_hash_free(ohash, osize);
573 }
574 EXPORT_SYMBOL(qdisc_class_hash_grow);
575
576 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
577 {
578         unsigned int size = 4;
579
580         clhash->hash = qdisc_class_hash_alloc(size);
581         if (clhash->hash == NULL)
582                 return -ENOMEM;
583         clhash->hashsize  = size;
584         clhash->hashmask  = size - 1;
585         clhash->hashelems = 0;
586         return 0;
587 }
588 EXPORT_SYMBOL(qdisc_class_hash_init);
589
590 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
591 {
592         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
593 }
594 EXPORT_SYMBOL(qdisc_class_hash_destroy);
595
596 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
597                              struct Qdisc_class_common *cl)
598 {
599         unsigned int h;
600
601         INIT_HLIST_NODE(&cl->hnode);
602         h = qdisc_class_hash(cl->classid, clhash->hashmask);
603         hlist_add_head(&cl->hnode, &clhash->hash[h]);
604         clhash->hashelems++;
605 }
606 EXPORT_SYMBOL(qdisc_class_hash_insert);
607
608 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
609                              struct Qdisc_class_common *cl)
610 {
611         hlist_del(&cl->hnode);
612         clhash->hashelems--;
613 }
614 EXPORT_SYMBOL(qdisc_class_hash_remove);
615
616 /* Allocate an unique handle from space managed by kernel */
617
618 static u32 qdisc_alloc_handle(struct net_device *dev)
619 {
620         int i = 0x10000;
621         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
622
623         do {
624                 autohandle += TC_H_MAKE(0x10000U, 0);
625                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
626                         autohandle = TC_H_MAKE(0x80000000U, 0);
627         } while (qdisc_lookup(dev, autohandle) && --i > 0);
628
629         return i > 0 ? autohandle : 0;
630 }
631
632 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
633 {
634         const struct Qdisc_class_ops *cops;
635         unsigned long cl;
636         u32 parentid;
637
638         if (n == 0)
639                 return;
640         while ((parentid = sch->parent)) {
641                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
642                         return;
643
644                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
645                 if (sch == NULL) {
646                         WARN_ON(parentid != TC_H_ROOT);
647                         return;
648                 }
649                 cops = sch->ops->cl_ops;
650                 if (cops->qlen_notify) {
651                         cl = cops->get(sch, parentid);
652                         cops->qlen_notify(sch, cl);
653                         cops->put(sch, cl);
654                 }
655                 sch->q.qlen -= n;
656         }
657 }
658 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
659
660 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
661                                struct nlmsghdr *n, u32 clid,
662                                struct Qdisc *old, struct Qdisc *new)
663 {
664         if (new || old)
665                 qdisc_notify(net, skb, n, clid, old, new);
666
667         if (old)
668                 qdisc_destroy(old);
669 }
670
671 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
672  * to device "dev".
673  *
674  * When appropriate send a netlink notification using 'skb'
675  * and "n".
676  *
677  * On success, destroy old qdisc.
678  */
679
680 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
681                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
682                        struct Qdisc *new, struct Qdisc *old)
683 {
684         struct Qdisc *q = old;
685         struct net *net = dev_net(dev);
686         int err = 0;
687
688         if (parent == NULL) {
689                 unsigned int i, num_q, ingress;
690
691                 ingress = 0;
692                 num_q = dev->num_tx_queues;
693                 if ((q && q->flags & TCQ_F_INGRESS) ||
694                     (new && new->flags & TCQ_F_INGRESS)) {
695                         num_q = 1;
696                         ingress = 1;
697                         if (!dev_ingress_queue(dev))
698                                 return -ENOENT;
699                 }
700
701                 if (dev->flags & IFF_UP)
702                         dev_deactivate(dev);
703
704                 if (new && new->ops->attach) {
705                         new->ops->attach(new);
706                         num_q = 0;
707                 }
708
709                 for (i = 0; i < num_q; i++) {
710                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
711
712                         if (!ingress)
713                                 dev_queue = netdev_get_tx_queue(dev, i);
714
715                         old = dev_graft_qdisc(dev_queue, new);
716                         if (new && i > 0)
717                                 atomic_inc(&new->refcnt);
718
719                         if (!ingress)
720                                 qdisc_destroy(old);
721                 }
722
723                 if (!ingress) {
724                         notify_and_destroy(net, skb, n, classid,
725                                            dev->qdisc, new);
726                         if (new && !new->ops->attach)
727                                 atomic_inc(&new->refcnt);
728                         dev->qdisc = new ? : &noop_qdisc;
729                 } else {
730                         notify_and_destroy(net, skb, n, classid, old, new);
731                 }
732
733                 if (dev->flags & IFF_UP)
734                         dev_activate(dev);
735         } else {
736                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
737
738                 err = -EOPNOTSUPP;
739                 if (cops && cops->graft) {
740                         unsigned long cl = cops->get(parent, classid);
741                         if (cl) {
742                                 err = cops->graft(parent, cl, new, &old);
743                                 cops->put(parent, cl);
744                         } else
745                                 err = -ENOENT;
746                 }
747                 if (!err)
748                         notify_and_destroy(net, skb, n, classid, old, new);
749         }
750         return err;
751 }
752
753 /* lockdep annotation is needed for ingress; egress gets it only for name */
754 static struct lock_class_key qdisc_tx_lock;
755 static struct lock_class_key qdisc_rx_lock;
756
757 /*
758    Allocate and initialize new qdisc.
759
760    Parameters are passed via opt.
761  */
762
763 static struct Qdisc *
764 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
765              struct Qdisc *p, u32 parent, u32 handle,
766              struct nlattr **tca, int *errp)
767 {
768         int err;
769         struct nlattr *kind = tca[TCA_KIND];
770         struct Qdisc *sch;
771         struct Qdisc_ops *ops;
772         struct qdisc_size_table *stab;
773
774         ops = qdisc_lookup_ops(kind);
775 #ifdef CONFIG_MODULES
776         if (ops == NULL && kind != NULL) {
777                 char name[IFNAMSIZ];
778                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
779                         /* We dropped the RTNL semaphore in order to
780                          * perform the module load.  So, even if we
781                          * succeeded in loading the module we have to
782                          * tell the caller to replay the request.  We
783                          * indicate this using -EAGAIN.
784                          * We replay the request because the device may
785                          * go away in the mean time.
786                          */
787                         rtnl_unlock();
788                         request_module("sch_%s", name);
789                         rtnl_lock();
790                         ops = qdisc_lookup_ops(kind);
791                         if (ops != NULL) {
792                                 /* We will try again qdisc_lookup_ops,
793                                  * so don't keep a reference.
794                                  */
795                                 module_put(ops->owner);
796                                 err = -EAGAIN;
797                                 goto err_out;
798                         }
799                 }
800         }
801 #endif
802
803         err = -ENOENT;
804         if (ops == NULL)
805                 goto err_out;
806
807         sch = qdisc_alloc(dev_queue, ops);
808         if (IS_ERR(sch)) {
809                 err = PTR_ERR(sch);
810                 goto err_out2;
811         }
812
813         sch->parent = parent;
814
815         if (handle == TC_H_INGRESS) {
816                 sch->flags |= TCQ_F_INGRESS;
817                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
818                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
819         } else {
820                 if (handle == 0) {
821                         handle = qdisc_alloc_handle(dev);
822                         err = -ENOMEM;
823                         if (handle == 0)
824                                 goto err_out3;
825                 }
826                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
827         }
828
829         sch->handle = handle;
830
831         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
832                 if (tca[TCA_STAB]) {
833                         stab = qdisc_get_stab(tca[TCA_STAB]);
834                         if (IS_ERR(stab)) {
835                                 err = PTR_ERR(stab);
836                                 goto err_out4;
837                         }
838                         sch->stab = stab;
839                 }
840                 if (tca[TCA_RATE]) {
841                         spinlock_t *root_lock;
842
843                         err = -EOPNOTSUPP;
844                         if (sch->flags & TCQ_F_MQROOT)
845                                 goto err_out4;
846
847                         if ((sch->parent != TC_H_ROOT) &&
848                             !(sch->flags & TCQ_F_INGRESS) &&
849                             (!p || !(p->flags & TCQ_F_MQROOT)))
850                                 root_lock = qdisc_root_sleeping_lock(sch);
851                         else
852                                 root_lock = qdisc_lock(sch);
853
854                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
855                                                 root_lock, tca[TCA_RATE]);
856                         if (err)
857                                 goto err_out4;
858                 }
859
860                 qdisc_list_add(sch);
861
862                 return sch;
863         }
864 err_out3:
865         dev_put(dev);
866         kfree((char *) sch - sch->padded);
867 err_out2:
868         module_put(ops->owner);
869 err_out:
870         *errp = err;
871         return NULL;
872
873 err_out4:
874         /*
875          * Any broken qdiscs that would require a ops->reset() here?
876          * The qdisc was never in action so it shouldn't be necessary.
877          */
878         qdisc_put_stab(sch->stab);
879         if (ops->destroy)
880                 ops->destroy(sch);
881         goto err_out3;
882 }
883
884 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
885 {
886         struct qdisc_size_table *stab = NULL;
887         int err = 0;
888
889         if (tca[TCA_OPTIONS]) {
890                 if (sch->ops->change == NULL)
891                         return -EINVAL;
892                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
893                 if (err)
894                         return err;
895         }
896
897         if (tca[TCA_STAB]) {
898                 stab = qdisc_get_stab(tca[TCA_STAB]);
899                 if (IS_ERR(stab))
900                         return PTR_ERR(stab);
901         }
902
903         qdisc_put_stab(sch->stab);
904         sch->stab = stab;
905
906         if (tca[TCA_RATE]) {
907                 /* NB: ignores errors from replace_estimator
908                    because change can't be undone. */
909                 if (sch->flags & TCQ_F_MQROOT)
910                         goto out;
911                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
912                                             qdisc_root_sleeping_lock(sch),
913                                             tca[TCA_RATE]);
914         }
915 out:
916         return 0;
917 }
918
919 struct check_loop_arg {
920         struct qdisc_walker     w;
921         struct Qdisc            *p;
922         int                     depth;
923 };
924
925 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
926
927 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
928 {
929         struct check_loop_arg   arg;
930
931         if (q->ops->cl_ops == NULL)
932                 return 0;
933
934         arg.w.stop = arg.w.skip = arg.w.count = 0;
935         arg.w.fn = check_loop_fn;
936         arg.depth = depth;
937         arg.p = p;
938         q->ops->cl_ops->walk(q, &arg.w);
939         return arg.w.stop ? -ELOOP : 0;
940 }
941
942 static int
943 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
944 {
945         struct Qdisc *leaf;
946         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
947         struct check_loop_arg *arg = (struct check_loop_arg *)w;
948
949         leaf = cops->leaf(q, cl);
950         if (leaf) {
951                 if (leaf == arg->p || arg->depth > 7)
952                         return -ELOOP;
953                 return check_loop(leaf, arg->p, arg->depth + 1);
954         }
955         return 0;
956 }
957
958 /*
959  * Delete/get qdisc.
960  */
961
962 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
963 {
964         struct net *net = sock_net(skb->sk);
965         struct tcmsg *tcm = NLMSG_DATA(n);
966         struct nlattr *tca[TCA_MAX + 1];
967         struct net_device *dev;
968         u32 clid = tcm->tcm_parent;
969         struct Qdisc *q = NULL;
970         struct Qdisc *p = NULL;
971         int err;
972
973         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
974         if (!dev)
975                 return -ENODEV;
976
977         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
978         if (err < 0)
979                 return err;
980
981         if (clid) {
982                 if (clid != TC_H_ROOT) {
983                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
984                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
985                                 if (!p)
986                                         return -ENOENT;
987                                 q = qdisc_leaf(p, clid);
988                         } else if (dev_ingress_queue(dev)) {
989                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
990                         }
991                 } else {
992                         q = dev->qdisc;
993                 }
994                 if (!q)
995                         return -ENOENT;
996
997                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
998                         return -EINVAL;
999         } else {
1000                 q = qdisc_lookup(dev, tcm->tcm_handle);
1001                 if (!q)
1002                         return -ENOENT;
1003         }
1004
1005         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1006                 return -EINVAL;
1007
1008         if (n->nlmsg_type == RTM_DELQDISC) {
1009                 if (!clid)
1010                         return -EINVAL;
1011                 if (q->handle == 0)
1012                         return -ENOENT;
1013                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1014                 if (err != 0)
1015                         return err;
1016         } else {
1017                 qdisc_notify(net, skb, n, clid, NULL, q);
1018         }
1019         return 0;
1020 }
1021
1022 /*
1023  * Create/change qdisc.
1024  */
1025
/*
 * Netlink handler for RTM_NEWQDISC (see pktsched_init()).
 *
 * Depending on tcm_parent, tcm_handle and the NLM_F_* flags of the
 * request this either
 *   - changes the parameters of an existing qdisc (qdisc_change() followed
 *     by a notification),
 *   - grafts an already existing qdisc under the given parent, or
 *   - creates a new qdisc and grafts it.
 *
 * When qdisc_create() fails with -EAGAIN the whole request is processed
 * again from the "replay" label.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm;
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

replay:
        /* Reinit, just in case something touches this. */
        tcm = NLMSG_DATA(n);
        clid = tcm->tcm_parent;
        q = p = NULL;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        if (clid) {
                /* Find the qdisc currently attached at the requested parent. */
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                p = qdisc_lookup(dev, TC_H_MAJ(clid));
                                if (!p)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else if (dev_ingress_queue_create(dev)) {
                                q = dev_ingress_queue(dev)->qdisc_sleeping;
                        }
                } else {
                        q = dev->qdisc;
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                /*
                                 * A handle was given and it is not the one
                                 * attached here: either graft the qdisc that
                                 * owns this handle or create a new one.
                                 */
                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
                                        return -EEXIST;
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                q = qdisc_lookup(dev, tcm->tcm_handle);
                                if (!q)
                                        goto create_n_graft;
                                if (n->nlmsg_flags & NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                                        return -EINVAL;
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                /* Take a reference for the graft below. */
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (!q)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know, that some child q is already
                                 *   attached to this parent and have choice:
                                 *   either to change it or to create/graft new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, requestor wanted to say,
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so that we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of hole in API, we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft, if
                                 *   user gave KIND, which does not match existing.
                                 */
                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
                                     (tca[TCA_KIND] &&
                                      nla_strcmp(tca[TCA_KIND], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                /* No parent given: the qdisc is identified by handle alone. */
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags & NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(net, skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS) {
                if (dev_ingress_queue(dev))
                        q = qdisc_create(dev, dev_ingress_queue(dev), p,
                                         tcm->tcm_parent, tcm->tcm_parent,
                                         tca, &err);
                else
                        err = -ENOENT;
        } else {
                struct netdev_queue *dev_queue;

                /*
                 * Pick the tx queue: let the parent choose if it provides
                 * ->select_queue(), else inherit the parent's queue, else
                 * fall back to tx queue 0 of the device.
                 */
                if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
                        dev_queue = p->ops->cl_ops->select_queue(p, tcm);
                else if (p)
                        dev_queue = p->dev_queue;
                else
                        dev_queue = netdev_get_tx_queue(dev, 0);

                q = qdisc_create(dev, dev_queue, p,
                                 tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err);
        }
        if (q == NULL) {
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
        if (err) {
                /* Grafting failed: release the qdisc/reference obtained above. */
                if (q)
                        qdisc_destroy(q);
                return err;
        }

        return 0;
}
1175
/*
 * Append one netlink message describing qdisc @q to @skb.
 *
 * The message consists of a tcmsg header (ifindex, parent @clid, handle,
 * refcount in tcm_info), the TCA_KIND string, the qdisc's own ->dump()
 * output, its size table if one is attached, and the statistics block.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back to
 * where it ended on entry and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 pid, u32 seq, u16 flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        unsigned char *b = skb_tail_pointer(skb);       /* rollback point */
        struct gnet_dump d;

        /* NOTE(review): NLMSG_NEW presumably jumps to nlmsg_failure when skb is full - confirm */
        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = atomic_read(&q->refcnt);
        /* NOTE(review): NLA_PUT_STRING presumably jumps to nla_put_failure on error - confirm */
        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto nla_put_failure;
        /* Refresh the queue length in the stats before they are copied out. */
        q->qstats.qlen = q->q.qlen;

        if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         qdisc_root_sleeping_lock(q), &d) < 0)
                goto nla_put_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto nla_put_failure;

        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, &q->qstats) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        /* Final message length: everything appended since entry. */
        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

nlmsg_failure:
nla_put_failure:
        /* Undo any partial message. */
        nlmsg_trim(skb, b);
        return -1;
}
1224
1225 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1226 {
1227         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1228 }
1229
1230 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1231                         struct nlmsghdr *n, u32 clid,
1232                         struct Qdisc *old, struct Qdisc *new)
1233 {
1234         struct sk_buff *skb;
1235         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1236
1237         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1238         if (!skb)
1239                 return -ENOBUFS;
1240
1241         if (old && !tc_qdisc_dump_ignore(old)) {
1242                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1243                                   0, RTM_DELQDISC) < 0)
1244                         goto err_out;
1245         }
1246         if (new && !tc_qdisc_dump_ignore(new)) {
1247                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1248                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1249                         goto err_out;
1250         }
1251
1252         if (skb->len)
1253                 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1254                                       n->nlmsg_flags & NLM_F_ECHO);
1255
1256 err_out:
1257         kfree_skb(skb);
1258         return -EINVAL;
1259 }
1260
1261 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1262                               struct netlink_callback *cb,
1263                               int *q_idx_p, int s_q_idx)
1264 {
1265         int ret = 0, q_idx = *q_idx_p;
1266         struct Qdisc *q;
1267
1268         if (!root)
1269                 return 0;
1270
1271         q = root;
1272         if (q_idx < s_q_idx) {
1273                 q_idx++;
1274         } else {
1275                 if (!tc_qdisc_dump_ignore(q) &&
1276                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1277                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1278                         goto done;
1279                 q_idx++;
1280         }
1281         list_for_each_entry(q, &root->list, list) {
1282                 if (q_idx < s_q_idx) {
1283                         q_idx++;
1284                         continue;
1285                 }
1286                 if (!tc_qdisc_dump_ignore(q) &&
1287                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1288                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1289                         goto done;
1290                 q_idx++;
1291         }
1292
1293 out:
1294         *q_idx_p = q_idx;
1295         return ret;
1296 done:
1297         ret = -1;
1298         goto out;
1299 }
1300
1301 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1302 {
1303         struct net *net = sock_net(skb->sk);
1304         int idx, q_idx;
1305         int s_idx, s_q_idx;
1306         struct net_device *dev;
1307
1308         s_idx = cb->args[0];
1309         s_q_idx = q_idx = cb->args[1];
1310
1311         rcu_read_lock();
1312         idx = 0;
1313         for_each_netdev_rcu(net, dev) {
1314                 struct netdev_queue *dev_queue;
1315
1316                 if (idx < s_idx)
1317                         goto cont;
1318                 if (idx > s_idx)
1319                         s_q_idx = 0;
1320                 q_idx = 0;
1321
1322                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1323                         goto done;
1324
1325                 dev_queue = dev_ingress_queue(dev);
1326                 if (dev_queue &&
1327                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1328                                        &q_idx, s_q_idx) < 0)
1329                         goto done;
1330
1331 cont:
1332                 idx++;
1333         }
1334
1335 done:
1336         rcu_read_unlock();
1337
1338         cb->args[0] = idx;
1339         cb->args[1] = q_idx;
1340
1341         return skb->len;
1342 }
1343
1344
1345
1346 /************************************************
1347  *      Traffic classes manipulation.           *
1348  ************************************************/
1349
1350
1351
/*
 * Netlink handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS.
 *
 * Works out which qdisc the class belongs to from the major numbers of
 * tcm_parent and tcm_handle, looks the class up through the qdisc's
 * class operations and then, depending on the message type, creates or
 * changes it (->change), deletes it (->delete) or reports it.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct net *net = sock_net(skb->sk);
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct nlattr *tca[TCA_MAX + 1];
        struct net_device *dev;
        struct Qdisc *q = NULL;
        const struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 pid = tcm->tcm_parent;      /* parent class id, not a process id */
        u32 clid = tcm->tcm_handle;
        u32 qid = TC_H_MAJ(clid);
        int err;

        dev = __dev_get_by_index(net, tcm->tcm_ifindex);
        if (!dev)
                return -ENODEV;

        err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
        if (err < 0)
                return err;

        /*
           parent == TC_H_UNSPEC - unspecified parent.
           parent == TC_H_ROOT   - class is root, which has no parent.
           parent == X:0         - parent is root class.
           parent == X:Y         - parent is a node in hierarchy.
           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

           handle == 0:0         - generate handle from kernel pool.
           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
           handle == X:Y         - clear.
           handle == X:0         - root class.
         */

        /* Step 1. Determine qdisc handle X:0 */

        if (pid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(pid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        qid = dev->qdisc->handle;

                /* Now qid is genuine qdisc handle consistent
                 * both with parent and child.
                 *
                 * TC_H_MAJ(pid) still may be unspecified, complete it now.
                 */
                if (pid)
                        pid = TC_H_MAKE(qid, pid);
        } else {
                if (qid == 0)
                        qid = dev->qdisc->handle;
        }

        /* OK. Locate qdisc */
        q = qdisc_lookup(dev, qid);
        if (!q)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get class */
        if (clid == 0) {
                if (pid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->get(q, clid);

        if (cl == 0) {
                /* Class does not exist: only a NEWTCLASS with CREATE may go on. */
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS ||
                    !(n->nlmsg_flags & NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        err = -EEXIST;
                        if (n->nlmsg_flags & NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = -EOPNOTSUPP;
                        if (cops->delete)
                                err = cops->delete(q, cl);
                        if (err == 0)
                                tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        /* Create a new class or change the existing one. */
        new_cl = cl;
        err = -EOPNOTSUPP;
        if (cops->change)
                err = cops->change(q, clid, pid, tca, &new_cl);
        if (err == 0)
                tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
        /* Release the class obtained from cops->get() above, if any. */
        if (cl)
                cops->put(q, cl);

        return err;
}
1475
1476
/*
 * Append one netlink message describing class @cl of qdisc @q to @skb.
 *
 * tcm_parent and tcm_handle are preset to the qdisc handle before the
 * class's ->dump() hook is called with the tcmsg header; the class
 * statistics follow via ->dump_stats().
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back to
 * where it ended on entry and -1 is returned.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
                          u32 pid, u32 seq, u16 flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr  *nlh;
        unsigned char *b = skb_tail_pointer(skb);       /* rollback point */
        struct gnet_dump d;
        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

        /* NOTE(review): NLMSG_NEW presumably jumps to nlmsg_failure when skb is full - confirm */
        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
        tcm->tcm_parent = q->handle;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = 0;
        /* NOTE(review): NLA_PUT_STRING presumably jumps to nla_put_failure on error - confirm */
        NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
                goto nla_put_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                         qdisc_root_sleeping_lock(q), &d) < 0)
                goto nla_put_failure;

        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
                goto nla_put_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto nla_put_failure;

        /* Final message length: everything appended since entry. */
        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

nlmsg_failure:
nla_put_failure:
        /* Undo any partial message. */
        nlmsg_trim(skb, b);
        return -1;
}
1518
1519 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1520                          struct nlmsghdr *n, struct Qdisc *q,
1521                          unsigned long cl, int event)
1522 {
1523         struct sk_buff *skb;
1524         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1525
1526         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1527         if (!skb)
1528                 return -ENOBUFS;
1529
1530         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1531                 kfree_skb(skb);
1532                 return -EINVAL;
1533         }
1534
1535         return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1536                               n->nlmsg_flags & NLM_F_ECHO);
1537 }
1538
/*
 * Walker context used when dumping the classes of a qdisc.
 *
 * 'w' must stay the first member: qdisc_class_dump() receives a pointer
 * to it and casts that pointer back to struct qdisc_dump_args.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;      /* generic walker state passed to ->walk() */
        struct sk_buff          *skb;   /* skb the class messages are appended to */
        struct netlink_callback *cb;    /* dump callback state of the request */
};
1544
1545 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1546 {
1547         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1548
1549         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1550                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1551 }
1552
1553 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1554                                 struct tcmsg *tcm, struct netlink_callback *cb,
1555                                 int *t_p, int s_t)
1556 {
1557         struct qdisc_dump_args arg;
1558
1559         if (tc_qdisc_dump_ignore(q) ||
1560             *t_p < s_t || !q->ops->cl_ops ||
1561             (tcm->tcm_parent &&
1562              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1563                 (*t_p)++;
1564                 return 0;
1565         }
1566         if (*t_p > s_t)
1567                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1568         arg.w.fn = qdisc_class_dump;
1569         arg.skb = skb;
1570         arg.cb = cb;
1571         arg.w.stop  = 0;
1572         arg.w.skip = cb->args[1];
1573         arg.w.count = 0;
1574         q->ops->cl_ops->walk(q, &arg.w);
1575         cb->args[1] = arg.w.count;
1576         if (arg.w.stop)
1577                 return -1;
1578         (*t_p)++;
1579         return 0;
1580 }
1581
1582 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1583                                struct tcmsg *tcm, struct netlink_callback *cb,
1584                                int *t_p, int s_t)
1585 {
1586         struct Qdisc *q;
1587
1588         if (!root)
1589                 return 0;
1590
1591         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1592                 return -1;
1593
1594         list_for_each_entry(q, &root->list, list) {
1595                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1596                         return -1;
1597         }
1598
1599         return 0;
1600 }
1601
1602 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1603 {
1604         struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1605         struct net *net = sock_net(skb->sk);
1606         struct netdev_queue *dev_queue;
1607         struct net_device *dev;
1608         int t, s_t;
1609
1610         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1611                 return 0;
1612         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1613         if (!dev)
1614                 return 0;
1615
1616         s_t = cb->args[0];
1617         t = 0;
1618
1619         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1620                 goto done;
1621
1622         dev_queue = dev_ingress_queue(dev);
1623         if (dev_queue &&
1624             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1625                                 &t, s_t) < 0)
1626                 goto done;
1627
1628 done:
1629         cb->args[0] = t;
1630
1631         dev_put(dev);
1632         return skb->len;
1633 }
1634
1635 /* Main classifier routine: scans classifier chain attached
1636  * to this qdisc, (optionally) tests for protocol and asks
1637  * specific classifiers.
1638  */
1639 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1640                        struct tcf_result *res)
1641 {
1642         __be16 protocol = skb->protocol;
1643         int err;
1644
1645         for (; tp; tp = tp->next) {
1646                 if (tp->protocol != protocol &&
1647                     tp->protocol != htons(ETH_P_ALL))
1648                         continue;
1649                 err = tp->classify(skb, tp, res);
1650
1651                 if (err >= 0) {
1652 #ifdef CONFIG_NET_CLS_ACT
1653                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1654                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1655 #endif
1656                         return err;
1657                 }
1658         }
1659         return -1;
1660 }
1661 EXPORT_SYMBOL(tc_classify_compat);
1662
/*
 * Classify @skb against the classifier chain @tp.
 *
 * Thin wrapper around tc_classify_compat().  With CONFIG_NET_CLS_ACT, a
 * TC_ACT_RECLASSIFY verdict restarts classification from the head of the
 * chain; the number of restarts is tracked in skb->tc_verd and once it
 * reaches MAX_REC_LOOP the packet is shot (TC_ACT_SHOT) with a
 * rate-limited notice.
 *
 * Returns the verdict of tc_classify_compat(), or TC_ACT_SHOT on a
 * reclassify loop.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err;
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *otp = tp;     /* chain head, for restarts */
reclassify:
#endif
        err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
        if (err == TC_ACT_RECLASSIFY) {
                u32 verd = G_TC_VERD(skb->tc_verd);
                tp = otp;

                if (verd++ >= MAX_REC_LOOP) {
                        if (net_ratelimit())
                                pr_notice("%s: packet reclassify loop"
                                          " rule prio %u protocol %02x\n",
                                          tp->q->ops->id,
                                          tp->prio & 0xffff,
                                          ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }
                /* Remember how often this packet has been reclassified. */
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                goto reclassify;
        }
#endif
        return err;
}
EXPORT_SYMBOL(tc_classify);
1696
1697 void tcf_destroy(struct tcf_proto *tp)
1698 {
1699         tp->ops->destroy(tp);
1700         module_put(tp->ops->owner);
1701         kfree(tp);
1702 }
1703
1704 void tcf_destroy_chain(struct tcf_proto **fl)
1705 {
1706         struct tcf_proto *tp;
1707
1708         while ((tp = *fl) != NULL) {
1709                 *fl = tp->next;
1710                 tcf_destroy(tp);
1711         }
1712 }
1713 EXPORT_SYMBOL(tcf_destroy_chain);
1714
1715 #ifdef CONFIG_PROC_FS
1716 static int psched_show(struct seq_file *seq, void *v)
1717 {
1718         struct timespec ts;
1719
1720         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1721         seq_printf(seq, "%08x %08x %08x %08x\n",
1722                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1723                    1000000,
1724                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1725
1726         return 0;
1727 }
1728
/* open() handler of the "psched" proc file: binds psched_show() via single_open(). */
static int psched_open(struct inode *inode, struct file *file)
{
        return single_open(file, psched_show, NULL);
}
1733
/* File operations of the "psched" proc entry (single-record seq_file). */
static const struct file_operations psched_fops = {
        .owner = THIS_MODULE,
        .open = psched_open,
        .read  = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};
1741
1742 static int __net_init psched_net_init(struct net *net)
1743 {
1744         struct proc_dir_entry *e;
1745
1746         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1747         if (e == NULL)
1748                 return -ENOMEM;
1749
1750         return 0;
1751 }
1752
/* Per-namespace exit: remove the "psched" proc entry of @net. */
static void __net_exit psched_net_exit(struct net *net)
{
        proc_net_remove(net, "psched");
}
1757 #else
/* Without CONFIG_PROC_FS there is no proc entry to create; always succeeds. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}
1762
/* Without CONFIG_PROC_FS there is nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
1766 #endif
1767
/* Per-network-namespace hooks, registered from pktsched_init(). */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
1772
1773 static int __init pktsched_init(void)
1774 {
1775         int err;
1776
1777         err = register_pernet_subsys(&psched_net_ops);
1778         if (err) {
1779                 pr_err("pktsched_init: "
1780                        "cannot initialize per netns operations\n");
1781                 return err;
1782         }
1783
1784         register_qdisc(&pfifo_qdisc_ops);
1785         register_qdisc(&bfifo_qdisc_ops);
1786         register_qdisc(&pfifo_head_drop_qdisc_ops);
1787         register_qdisc(&mq_qdisc_ops);
1788
1789         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1790         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1791         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1792         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1793         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1794         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1795
1796         return 0;
1797 }
1798
1799 subsys_initcall(pktsched_init);