]> git.karo-electronics.de Git - karo-tx-linux.git/blob - net/sched/sch_api.c
rt2x00: rt2800pci: use module_pci_driver macro
[karo-tx-linux.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
   Qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into
   a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0, if packet was enqueued successfully.
92    If packet (this one or another one) was dropped, it returns
93    not zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
/* Protects the list of registered TC modules. It is a pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/uregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* Get default qdisc if not otherwise specified */
204 void qdisc_get_default(char *name, size_t len)
205 {
206         read_lock(&qdisc_mod_lock);
207         strlcpy(name, default_qdisc_ops->id, len);
208         read_unlock(&qdisc_mod_lock);
209 }
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
226 /* Set new default qdisc to use */
227 int qdisc_set_default(const char *name)
228 {
229         const struct Qdisc_ops *ops;
230
231         if (!capable(CAP_NET_ADMIN))
232                 return -EPERM;
233
234         write_lock(&qdisc_mod_lock);
235         ops = qdisc_lookup_default(name);
236         if (!ops) {
237                 /* Not found, drop lock and try to load module */
238                 write_unlock(&qdisc_mod_lock);
239                 request_module("sch_%s", name);
240                 write_lock(&qdisc_mod_lock);
241
242                 ops = qdisc_lookup_default(name);
243         }
244
245         if (ops) {
246                 /* Set new default */
247                 module_put(default_qdisc_ops->owner);
248                 default_qdisc_ops = ops;
249         }
250         write_unlock(&qdisc_mod_lock);
251
252         return ops ? 0 : -ENOENT;
253 }
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256    (root qdisc, all its children, children of children etc.)
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!(root->flags & TCQ_F_BUILTIN) &&
264             root->handle == handle)
265                 return root;
266
267         list_for_each_entry(q, &root->list, list) {
268                 if (q->handle == handle)
269                         return q;
270         }
271         return NULL;
272 }
273
274 static void qdisc_list_add(struct Qdisc *q)
275 {
276         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
277                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
278 }
279
280 void qdisc_list_del(struct Qdisc *q)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
283                 list_del(&q->list);
284 }
285 EXPORT_SYMBOL(qdisc_list_del);
286
287 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
288 {
289         struct Qdisc *q;
290
291         q = qdisc_match_from_root(dev->qdisc, handle);
292         if (q)
293                 goto out;
294
295         if (dev_ingress_queue(dev))
296                 q = qdisc_match_from_root(
297                         dev_ingress_queue(dev)->qdisc_sleeping,
298                         handle);
299 out:
300         return q;
301 }
302
303 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
304 {
305         unsigned long cl;
306         struct Qdisc *leaf;
307         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
308
309         if (cops == NULL)
310                 return NULL;
311         cl = cops->get(p, classid);
312
313         if (cl == 0)
314                 return NULL;
315         leaf = cops->leaf(p, cl);
316         cops->put(p, cl);
317         return leaf;
318 }
319
320 /* Find queueing discipline by name */
321
322 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
323 {
324         struct Qdisc_ops *q = NULL;
325
326         if (kind) {
327                 read_lock(&qdisc_mod_lock);
328                 for (q = qdisc_base; q; q = q->next) {
329                         if (nla_strcmp(kind, q->id) == 0) {
330                                 if (!try_module_get(q->owner))
331                                         q = NULL;
332                                 break;
333                         }
334                 }
335                 read_unlock(&qdisc_mod_lock);
336         }
337         return q;
338 }
339
340 /* The linklayer setting were not transferred from iproute2, in older
341  * versions, and the rate tables lookup systems have been dropped in
342  * the kernel. To keep backward compatible with older iproute2 tc
343  * utils, we detect the linklayer setting by detecting if the rate
344  * table were modified.
345  *
346  * For linklayer ATM table entries, the rate table will be aligned to
347  * 48 bytes, thus some table entries will contain the same value.  The
348  * mpu (min packet unit) is also encoded into the old rate table, thus
349  * starting from the mpu, we find low and high table entries for
350  * mapping this cell.  If these entries contain the same value, when
351  * the rate tables have been modified for linklayer ATM.
352  *
353  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
354  * and then roundup to the next cell, calc the table entry one below,
355  * and compare.
356  */
357 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
358 {
359         int low       = roundup(r->mpu, 48);
360         int high      = roundup(low+1, 48);
361         int cell_low  = low >> r->cell_log;
362         int cell_high = (high >> r->cell_log) - 1;
363
364         /* rtab is too inaccurate at rates > 100Mbit/s */
365         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
366                 pr_debug("TC linklayer: Giving up ATM detection\n");
367                 return TC_LINKLAYER_ETHERNET;
368         }
369
370         if ((cell_high > cell_low) && (cell_high < 256)
371             && (rtab[cell_low] == rtab[cell_high])) {
372                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
373                          cell_low, cell_high, rtab[cell_high]);
374                 return TC_LINKLAYER_ATM;
375         }
376         return TC_LINKLAYER_ETHERNET;
377 }
378
379 static struct qdisc_rate_table *qdisc_rtab_list;
380
381 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
382 {
383         struct qdisc_rate_table *rtab;
384
385         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
386             nla_len(tab) != TC_RTAB_SIZE)
387                 return NULL;
388
389         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
390                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
391                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
392                         rtab->refcnt++;
393                         return rtab;
394                 }
395         }
396
397         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
398         if (rtab) {
399                 rtab->rate = *r;
400                 rtab->refcnt = 1;
401                 memcpy(rtab->data, nla_data(tab), 1024);
402                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
403                         r->linklayer = __detect_linklayer(r, rtab->data);
404                 rtab->next = qdisc_rtab_list;
405                 qdisc_rtab_list = rtab;
406         }
407         return rtab;
408 }
409 EXPORT_SYMBOL(qdisc_get_rtab);
410
411 void qdisc_put_rtab(struct qdisc_rate_table *tab)
412 {
413         struct qdisc_rate_table *rtab, **rtabp;
414
415         if (!tab || --tab->refcnt)
416                 return;
417
418         for (rtabp = &qdisc_rtab_list;
419              (rtab = *rtabp) != NULL;
420              rtabp = &rtab->next) {
421                 if (rtab == tab) {
422                         *rtabp = rtab->next;
423                         kfree(rtab);
424                         return;
425                 }
426         }
427 }
428 EXPORT_SYMBOL(qdisc_put_rtab);
429
430 static LIST_HEAD(qdisc_stab_list);
431 static DEFINE_SPINLOCK(qdisc_stab_lock);
432
433 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
434         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
435         [TCA_STAB_DATA] = { .type = NLA_BINARY },
436 };
437
438 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
439 {
440         struct nlattr *tb[TCA_STAB_MAX + 1];
441         struct qdisc_size_table *stab;
442         struct tc_sizespec *s;
443         unsigned int tsize = 0;
444         u16 *tab = NULL;
445         int err;
446
447         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
448         if (err < 0)
449                 return ERR_PTR(err);
450         if (!tb[TCA_STAB_BASE])
451                 return ERR_PTR(-EINVAL);
452
453         s = nla_data(tb[TCA_STAB_BASE]);
454
455         if (s->tsize > 0) {
456                 if (!tb[TCA_STAB_DATA])
457                         return ERR_PTR(-EINVAL);
458                 tab = nla_data(tb[TCA_STAB_DATA]);
459                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
460         }
461
462         if (tsize != s->tsize || (!tab && tsize > 0))
463                 return ERR_PTR(-EINVAL);
464
465         spin_lock(&qdisc_stab_lock);
466
467         list_for_each_entry(stab, &qdisc_stab_list, list) {
468                 if (memcmp(&stab->szopts, s, sizeof(*s)))
469                         continue;
470                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
471                         continue;
472                 stab->refcnt++;
473                 spin_unlock(&qdisc_stab_lock);
474                 return stab;
475         }
476
477         spin_unlock(&qdisc_stab_lock);
478
479         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
480         if (!stab)
481                 return ERR_PTR(-ENOMEM);
482
483         stab->refcnt = 1;
484         stab->szopts = *s;
485         if (tsize > 0)
486                 memcpy(stab->data, tab, tsize * sizeof(u16));
487
488         spin_lock(&qdisc_stab_lock);
489         list_add_tail(&stab->list, &qdisc_stab_list);
490         spin_unlock(&qdisc_stab_lock);
491
492         return stab;
493 }
494
495 static void stab_kfree_rcu(struct rcu_head *head)
496 {
497         kfree(container_of(head, struct qdisc_size_table, rcu));
498 }
499
500 void qdisc_put_stab(struct qdisc_size_table *tab)
501 {
502         if (!tab)
503                 return;
504
505         spin_lock(&qdisc_stab_lock);
506
507         if (--tab->refcnt == 0) {
508                 list_del(&tab->list);
509                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
510         }
511
512         spin_unlock(&qdisc_stab_lock);
513 }
514 EXPORT_SYMBOL(qdisc_put_stab);
515
516 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
517 {
518         struct nlattr *nest;
519
520         nest = nla_nest_start(skb, TCA_STAB);
521         if (nest == NULL)
522                 goto nla_put_failure;
523         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
524                 goto nla_put_failure;
525         nla_nest_end(skb, nest);
526
527         return skb->len;
528
529 nla_put_failure:
530         return -1;
531 }
532
533 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
534 {
535         int pkt_len, slot;
536
537         pkt_len = skb->len + stab->szopts.overhead;
538         if (unlikely(!stab->szopts.tsize))
539                 goto out;
540
541         slot = pkt_len + stab->szopts.cell_align;
542         if (unlikely(slot < 0))
543                 slot = 0;
544
545         slot >>= stab->szopts.cell_log;
546         if (likely(slot < stab->szopts.tsize))
547                 pkt_len = stab->data[slot];
548         else
549                 pkt_len = stab->data[stab->szopts.tsize - 1] *
550                                 (slot / stab->szopts.tsize) +
551                                 stab->data[slot % stab->szopts.tsize];
552
553         pkt_len <<= stab->szopts.size_log;
554 out:
555         if (unlikely(pkt_len < 1))
556                 pkt_len = 1;
557         qdisc_skb_cb(skb)->pkt_len = pkt_len;
558 }
559 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
560
561 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
562 {
563         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
564                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
565                         txt, qdisc->ops->id, qdisc->handle >> 16);
566                 qdisc->flags |= TCQ_F_WARN_NONWC;
567         }
568 }
569 EXPORT_SYMBOL(qdisc_warn_nonwc);
570
571 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
572 {
573         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
574                                                  timer);
575
576         qdisc_unthrottled(wd->qdisc);
577         __netif_schedule(qdisc_root(wd->qdisc));
578
579         return HRTIMER_NORESTART;
580 }
581
582 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
583 {
584         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
585         wd->timer.function = qdisc_watchdog;
586         wd->qdisc = qdisc;
587 }
588 EXPORT_SYMBOL(qdisc_watchdog_init);
589
590 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
591 {
592         if (test_bit(__QDISC_STATE_DEACTIVATED,
593                      &qdisc_root_sleeping(wd->qdisc)->state))
594                 return;
595
596         qdisc_throttled(wd->qdisc);
597
598         hrtimer_start(&wd->timer,
599                       ns_to_ktime(expires),
600                       HRTIMER_MODE_ABS);
601 }
602 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
603
604 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
605 {
606         hrtimer_cancel(&wd->timer);
607         qdisc_unthrottled(wd->qdisc);
608 }
609 EXPORT_SYMBOL(qdisc_watchdog_cancel);
610
611 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
612 {
613         unsigned int size = n * sizeof(struct hlist_head), i;
614         struct hlist_head *h;
615
616         if (size <= PAGE_SIZE)
617                 h = kmalloc(size, GFP_KERNEL);
618         else
619                 h = (struct hlist_head *)
620                         __get_free_pages(GFP_KERNEL, get_order(size));
621
622         if (h != NULL) {
623                 for (i = 0; i < n; i++)
624                         INIT_HLIST_HEAD(&h[i]);
625         }
626         return h;
627 }
628
629 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
630 {
631         unsigned int size = n * sizeof(struct hlist_head);
632
633         if (size <= PAGE_SIZE)
634                 kfree(h);
635         else
636                 free_pages((unsigned long)h, get_order(size));
637 }
638
639 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
640 {
641         struct Qdisc_class_common *cl;
642         struct hlist_node *next;
643         struct hlist_head *nhash, *ohash;
644         unsigned int nsize, nmask, osize;
645         unsigned int i, h;
646
647         /* Rehash when load factor exceeds 0.75 */
648         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
649                 return;
650         nsize = clhash->hashsize * 2;
651         nmask = nsize - 1;
652         nhash = qdisc_class_hash_alloc(nsize);
653         if (nhash == NULL)
654                 return;
655
656         ohash = clhash->hash;
657         osize = clhash->hashsize;
658
659         sch_tree_lock(sch);
660         for (i = 0; i < osize; i++) {
661                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
662                         h = qdisc_class_hash(cl->classid, nmask);
663                         hlist_add_head(&cl->hnode, &nhash[h]);
664                 }
665         }
666         clhash->hash     = nhash;
667         clhash->hashsize = nsize;
668         clhash->hashmask = nmask;
669         sch_tree_unlock(sch);
670
671         qdisc_class_hash_free(ohash, osize);
672 }
673 EXPORT_SYMBOL(qdisc_class_hash_grow);
674
675 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
676 {
677         unsigned int size = 4;
678
679         clhash->hash = qdisc_class_hash_alloc(size);
680         if (clhash->hash == NULL)
681                 return -ENOMEM;
682         clhash->hashsize  = size;
683         clhash->hashmask  = size - 1;
684         clhash->hashelems = 0;
685         return 0;
686 }
687 EXPORT_SYMBOL(qdisc_class_hash_init);
688
689 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
690 {
691         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
692 }
693 EXPORT_SYMBOL(qdisc_class_hash_destroy);
694
695 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
696                              struct Qdisc_class_common *cl)
697 {
698         unsigned int h;
699
700         INIT_HLIST_NODE(&cl->hnode);
701         h = qdisc_class_hash(cl->classid, clhash->hashmask);
702         hlist_add_head(&cl->hnode, &clhash->hash[h]);
703         clhash->hashelems++;
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_insert);
706
707 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
708                              struct Qdisc_class_common *cl)
709 {
710         hlist_del(&cl->hnode);
711         clhash->hashelems--;
712 }
713 EXPORT_SYMBOL(qdisc_class_hash_remove);
714
715 /* Allocate an unique handle from space managed by kernel
716  * Possible range is [8000-FFFF]:0000 (0x8000 values)
717  */
718 static u32 qdisc_alloc_handle(struct net_device *dev)
719 {
720         int i = 0x8000;
721         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
722
723         do {
724                 autohandle += TC_H_MAKE(0x10000U, 0);
725                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
726                         autohandle = TC_H_MAKE(0x80000000U, 0);
727                 if (!qdisc_lookup(dev, autohandle))
728                         return autohandle;
729                 cond_resched();
730         } while (--i > 0);
731
732         return 0;
733 }
734
735 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
736 {
737         const struct Qdisc_class_ops *cops;
738         unsigned long cl;
739         u32 parentid;
740
741         if (n == 0)
742                 return;
743         while ((parentid = sch->parent)) {
744                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
745                         return;
746
747                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
748                 if (sch == NULL) {
749                         WARN_ON(parentid != TC_H_ROOT);
750                         return;
751                 }
752                 cops = sch->ops->cl_ops;
753                 if (cops->qlen_notify) {
754                         cl = cops->get(sch, parentid);
755                         cops->qlen_notify(sch, cl);
756                         cops->put(sch, cl);
757                 }
758                 sch->q.qlen -= n;
759         }
760 }
761 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
762
763 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
764                                struct nlmsghdr *n, u32 clid,
765                                struct Qdisc *old, struct Qdisc *new)
766 {
767         if (new || old)
768                 qdisc_notify(net, skb, n, clid, old, new);
769
770         if (old)
771                 qdisc_destroy(old);
772 }
773
774 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
775  * to device "dev".
776  *
777  * When appropriate send a netlink notification using 'skb'
778  * and "n".
779  *
780  * On success, destroy old qdisc.
781  */
782
783 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
784                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
785                        struct Qdisc *new, struct Qdisc *old)
786 {
787         struct Qdisc *q = old;
788         struct net *net = dev_net(dev);
789         int err = 0;
790
791         if (parent == NULL) {
792                 unsigned int i, num_q, ingress;
793
794                 ingress = 0;
795                 num_q = dev->num_tx_queues;
796                 if ((q && q->flags & TCQ_F_INGRESS) ||
797                     (new && new->flags & TCQ_F_INGRESS)) {
798                         num_q = 1;
799                         ingress = 1;
800                         if (!dev_ingress_queue(dev))
801                                 return -ENOENT;
802                 }
803
804                 if (dev->flags & IFF_UP)
805                         dev_deactivate(dev);
806
807                 if (new && new->ops->attach) {
808                         new->ops->attach(new);
809                         num_q = 0;
810                 }
811
812                 for (i = 0; i < num_q; i++) {
813                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
814
815                         if (!ingress)
816                                 dev_queue = netdev_get_tx_queue(dev, i);
817
818                         old = dev_graft_qdisc(dev_queue, new);
819                         if (new && i > 0)
820                                 atomic_inc(&new->refcnt);
821
822                         if (!ingress)
823                                 qdisc_destroy(old);
824                 }
825
826                 if (!ingress) {
827                         notify_and_destroy(net, skb, n, classid,
828                                            dev->qdisc, new);
829                         if (new && !new->ops->attach)
830                                 atomic_inc(&new->refcnt);
831                         dev->qdisc = new ? : &noop_qdisc;
832                 } else {
833                         notify_and_destroy(net, skb, n, classid, old, new);
834                 }
835
836                 if (dev->flags & IFF_UP)
837                         dev_activate(dev);
838         } else {
839                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
840
841                 err = -EOPNOTSUPP;
842                 if (cops && cops->graft) {
843                         unsigned long cl = cops->get(parent, classid);
844                         if (cl) {
845                                 err = cops->graft(parent, cl, new, &old);
846                                 cops->put(parent, cl);
847                         } else
848                                 err = -ENOENT;
849                 }
850                 if (!err)
851                         notify_and_destroy(net, skb, n, classid, old, new);
852         }
853         return err;
854 }
855
856 /* lockdep annotation is needed for ingress; egress gets it only for name */
857 static struct lock_class_key qdisc_tx_lock;
858 static struct lock_class_key qdisc_rx_lock;
859
860 /*
861    Allocate and initialize new qdisc.
862
863    Parameters are passed via opt.
864  */
865
866 static struct Qdisc *
867 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
868              struct Qdisc *p, u32 parent, u32 handle,
869              struct nlattr **tca, int *errp)
870 {
871         int err;
872         struct nlattr *kind = tca[TCA_KIND];
873         struct Qdisc *sch;
874         struct Qdisc_ops *ops;
875         struct qdisc_size_table *stab;
876
877         ops = qdisc_lookup_ops(kind);
878 #ifdef CONFIG_MODULES
879         if (ops == NULL && kind != NULL) {
880                 char name[IFNAMSIZ];
881                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
882                         /* We dropped the RTNL semaphore in order to
883                          * perform the module load.  So, even if we
884                          * succeeded in loading the module we have to
885                          * tell the caller to replay the request.  We
886                          * indicate this using -EAGAIN.
887                          * We replay the request because the device may
888                          * go away in the mean time.
889                          */
890                         rtnl_unlock();
891                         request_module("sch_%s", name);
892                         rtnl_lock();
893                         ops = qdisc_lookup_ops(kind);
894                         if (ops != NULL) {
895                                 /* We will try again qdisc_lookup_ops,
896                                  * so don't keep a reference.
897                                  */
898                                 module_put(ops->owner);
899                                 err = -EAGAIN;
900                                 goto err_out;
901                         }
902                 }
903         }
904 #endif
905
906         err = -ENOENT;
907         if (ops == NULL)
908                 goto err_out;
909
910         sch = qdisc_alloc(dev_queue, ops);
911         if (IS_ERR(sch)) {
912                 err = PTR_ERR(sch);
913                 goto err_out2;
914         }
915
916         sch->parent = parent;
917
918         if (handle == TC_H_INGRESS) {
919                 sch->flags |= TCQ_F_INGRESS;
920                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
921                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
922         } else {
923                 if (handle == 0) {
924                         handle = qdisc_alloc_handle(dev);
925                         err = -ENOMEM;
926                         if (handle == 0)
927                                 goto err_out3;
928                 }
929                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
930                 if (!netif_is_multiqueue(dev))
931                         sch->flags |= TCQ_F_ONETXQUEUE;
932         }
933
934         sch->handle = handle;
935
936         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
937                 if (tca[TCA_STAB]) {
938                         stab = qdisc_get_stab(tca[TCA_STAB]);
939                         if (IS_ERR(stab)) {
940                                 err = PTR_ERR(stab);
941                                 goto err_out4;
942                         }
943                         rcu_assign_pointer(sch->stab, stab);
944                 }
945                 if (tca[TCA_RATE]) {
946                         spinlock_t *root_lock;
947
948                         err = -EOPNOTSUPP;
949                         if (sch->flags & TCQ_F_MQROOT)
950                                 goto err_out4;
951
952                         if ((sch->parent != TC_H_ROOT) &&
953                             !(sch->flags & TCQ_F_INGRESS) &&
954                             (!p || !(p->flags & TCQ_F_MQROOT)))
955                                 root_lock = qdisc_root_sleeping_lock(sch);
956                         else
957                                 root_lock = qdisc_lock(sch);
958
959                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
960                                                 root_lock, tca[TCA_RATE]);
961                         if (err)
962                                 goto err_out4;
963                 }
964
965                 qdisc_list_add(sch);
966
967                 return sch;
968         }
969 err_out3:
970         dev_put(dev);
971         kfree((char *) sch - sch->padded);
972 err_out2:
973         module_put(ops->owner);
974 err_out:
975         *errp = err;
976         return NULL;
977
978 err_out4:
979         /*
980          * Any broken qdiscs that would require a ops->reset() here?
981          * The qdisc was never in action so it shouldn't be necessary.
982          */
983         qdisc_put_stab(rtnl_dereference(sch->stab));
984         if (ops->destroy)
985                 ops->destroy(sch);
986         goto err_out3;
987 }
988
989 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
990 {
991         struct qdisc_size_table *ostab, *stab = NULL;
992         int err = 0;
993
994         if (tca[TCA_OPTIONS]) {
995                 if (sch->ops->change == NULL)
996                         return -EINVAL;
997                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
998                 if (err)
999                         return err;
1000         }
1001
1002         if (tca[TCA_STAB]) {
1003                 stab = qdisc_get_stab(tca[TCA_STAB]);
1004                 if (IS_ERR(stab))
1005                         return PTR_ERR(stab);
1006         }
1007
1008         ostab = rtnl_dereference(sch->stab);
1009         rcu_assign_pointer(sch->stab, stab);
1010         qdisc_put_stab(ostab);
1011
1012         if (tca[TCA_RATE]) {
1013                 /* NB: ignores errors from replace_estimator
1014                    because change can't be undone. */
1015                 if (sch->flags & TCQ_F_MQROOT)
1016                         goto out;
1017                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
1018                                             qdisc_root_sleeping_lock(sch),
1019                                             tca[TCA_RATE]);
1020         }
1021 out:
1022         return 0;
1023 }
1024
1025 struct check_loop_arg {
1026         struct qdisc_walker     w;
1027         struct Qdisc            *p;
1028         int                     depth;
1029 };
1030
1031 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1032
1033 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1034 {
1035         struct check_loop_arg   arg;
1036
1037         if (q->ops->cl_ops == NULL)
1038                 return 0;
1039
1040         arg.w.stop = arg.w.skip = arg.w.count = 0;
1041         arg.w.fn = check_loop_fn;
1042         arg.depth = depth;
1043         arg.p = p;
1044         q->ops->cl_ops->walk(q, &arg.w);
1045         return arg.w.stop ? -ELOOP : 0;
1046 }
1047
1048 static int
1049 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1050 {
1051         struct Qdisc *leaf;
1052         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1053         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1054
1055         leaf = cops->leaf(q, cl);
1056         if (leaf) {
1057                 if (leaf == arg->p || arg->depth > 7)
1058                         return -ELOOP;
1059                 return check_loop(leaf, arg->p, arg->depth + 1);
1060         }
1061         return 0;
1062 }
1063
1064 /*
1065  * Delete/get qdisc.
1066  */
1067
1068 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1069 {
1070         struct net *net = sock_net(skb->sk);
1071         struct tcmsg *tcm = nlmsg_data(n);
1072         struct nlattr *tca[TCA_MAX + 1];
1073         struct net_device *dev;
1074         u32 clid;
1075         struct Qdisc *q = NULL;
1076         struct Qdisc *p = NULL;
1077         int err;
1078
1079         if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
1080                 return -EPERM;
1081
1082         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1083         if (err < 0)
1084                 return err;
1085
1086         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1087         if (!dev)
1088                 return -ENODEV;
1089
1090         clid = tcm->tcm_parent;
1091         if (clid) {
1092                 if (clid != TC_H_ROOT) {
1093                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1094                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1095                                 if (!p)
1096                                         return -ENOENT;
1097                                 q = qdisc_leaf(p, clid);
1098                         } else if (dev_ingress_queue(dev)) {
1099                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1100                         }
1101                 } else {
1102                         q = dev->qdisc;
1103                 }
1104                 if (!q)
1105                         return -ENOENT;
1106
1107                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1108                         return -EINVAL;
1109         } else {
1110                 q = qdisc_lookup(dev, tcm->tcm_handle);
1111                 if (!q)
1112                         return -ENOENT;
1113         }
1114
1115         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1116                 return -EINVAL;
1117
1118         if (n->nlmsg_type == RTM_DELQDISC) {
1119                 if (!clid)
1120                         return -EINVAL;
1121                 if (q->handle == 0)
1122                         return -ENOENT;
1123                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1124                 if (err != 0)
1125                         return err;
1126         } else {
1127                 qdisc_notify(net, skb, n, clid, NULL, q);
1128         }
1129         return 0;
1130 }
1131
1132 /*
1133  * Create/change qdisc.
1134  */
1135
1136 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1137 {
1138         struct net *net = sock_net(skb->sk);
1139         struct tcmsg *tcm;
1140         struct nlattr *tca[TCA_MAX + 1];
1141         struct net_device *dev;
1142         u32 clid;
1143         struct Qdisc *q, *p;
1144         int err;
1145
1146         if (!capable(CAP_NET_ADMIN))
1147                 return -EPERM;
1148
1149 replay:
1150         /* Reinit, just in case something touches this. */
1151         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1152         if (err < 0)
1153                 return err;
1154
1155         tcm = nlmsg_data(n);
1156         clid = tcm->tcm_parent;
1157         q = p = NULL;
1158
1159         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1160         if (!dev)
1161                 return -ENODEV;
1162
1163
1164         if (clid) {
1165                 if (clid != TC_H_ROOT) {
1166                         if (clid != TC_H_INGRESS) {
1167                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1168                                 if (!p)
1169                                         return -ENOENT;
1170                                 q = qdisc_leaf(p, clid);
1171                         } else if (dev_ingress_queue_create(dev)) {
1172                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1173                         }
1174                 } else {
1175                         q = dev->qdisc;
1176                 }
1177
1178                 /* It may be default qdisc, ignore it */
1179                 if (q && q->handle == 0)
1180                         q = NULL;
1181
1182                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1183                         if (tcm->tcm_handle) {
1184                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1185                                         return -EEXIST;
1186                                 if (TC_H_MIN(tcm->tcm_handle))
1187                                         return -EINVAL;
1188                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1189                                 if (!q)
1190                                         goto create_n_graft;
1191                                 if (n->nlmsg_flags & NLM_F_EXCL)
1192                                         return -EEXIST;
1193                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1194                                         return -EINVAL;
1195                                 if (q == p ||
1196                                     (p && check_loop(q, p, 0)))
1197                                         return -ELOOP;
1198                                 atomic_inc(&q->refcnt);
1199                                 goto graft;
1200                         } else {
1201                                 if (!q)
1202                                         goto create_n_graft;
1203
1204                                 /* This magic test requires explanation.
1205                                  *
1206                                  *   We know, that some child q is already
1207                                  *   attached to this parent and have choice:
1208                                  *   either to change it or to create/graft new one.
1209                                  *
1210                                  *   1. We are allowed to create/graft only
1211                                  *   if CREATE and REPLACE flags are set.
1212                                  *
1213                                  *   2. If EXCL is set, requestor wanted to say,
1214                                  *   that qdisc tcm_handle is not expected
1215                                  *   to exist, so that we choose create/graft too.
1216                                  *
1217                                  *   3. The last case is when no flags are set.
1218                                  *   Alas, it is sort of hole in API, we
1219                                  *   cannot decide what to do unambiguously.
1220                                  *   For now we select create/graft, if
1221                                  *   user gave KIND, which does not match existing.
1222                                  */
1223                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1224                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1225                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1226                                      (tca[TCA_KIND] &&
1227                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1228                                         goto create_n_graft;
1229                         }
1230                 }
1231         } else {
1232                 if (!tcm->tcm_handle)
1233                         return -EINVAL;
1234                 q = qdisc_lookup(dev, tcm->tcm_handle);
1235         }
1236
1237         /* Change qdisc parameters */
1238         if (q == NULL)
1239                 return -ENOENT;
1240         if (n->nlmsg_flags & NLM_F_EXCL)
1241                 return -EEXIST;
1242         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1243                 return -EINVAL;
1244         err = qdisc_change(q, tca);
1245         if (err == 0)
1246                 qdisc_notify(net, skb, n, clid, NULL, q);
1247         return err;
1248
1249 create_n_graft:
1250         if (!(n->nlmsg_flags & NLM_F_CREATE))
1251                 return -ENOENT;
1252         if (clid == TC_H_INGRESS) {
1253                 if (dev_ingress_queue(dev))
1254                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1255                                          tcm->tcm_parent, tcm->tcm_parent,
1256                                          tca, &err);
1257                 else
1258                         err = -ENOENT;
1259         } else {
1260                 struct netdev_queue *dev_queue;
1261
1262                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1263                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1264                 else if (p)
1265                         dev_queue = p->dev_queue;
1266                 else
1267                         dev_queue = netdev_get_tx_queue(dev, 0);
1268
1269                 q = qdisc_create(dev, dev_queue, p,
1270                                  tcm->tcm_parent, tcm->tcm_handle,
1271                                  tca, &err);
1272         }
1273         if (q == NULL) {
1274                 if (err == -EAGAIN)
1275                         goto replay;
1276                 return err;
1277         }
1278
1279 graft:
1280         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1281         if (err) {
1282                 if (q)
1283                         qdisc_destroy(q);
1284                 return err;
1285         }
1286
1287         return 0;
1288 }
1289
1290 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1291                          u32 portid, u32 seq, u16 flags, int event)
1292 {
1293         struct tcmsg *tcm;
1294         struct nlmsghdr  *nlh;
1295         unsigned char *b = skb_tail_pointer(skb);
1296         struct gnet_dump d;
1297         struct qdisc_size_table *stab;
1298
1299         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1300         if (!nlh)
1301                 goto out_nlmsg_trim;
1302         tcm = nlmsg_data(nlh);
1303         tcm->tcm_family = AF_UNSPEC;
1304         tcm->tcm__pad1 = 0;
1305         tcm->tcm__pad2 = 0;
1306         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1307         tcm->tcm_parent = clid;
1308         tcm->tcm_handle = q->handle;
1309         tcm->tcm_info = atomic_read(&q->refcnt);
1310         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1311                 goto nla_put_failure;
1312         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1313                 goto nla_put_failure;
1314         q->qstats.qlen = q->q.qlen;
1315
1316         stab = rtnl_dereference(q->stab);
1317         if (stab && qdisc_dump_stab(skb, stab) < 0)
1318                 goto nla_put_failure;
1319
1320         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1321                                          qdisc_root_sleeping_lock(q), &d) < 0)
1322                 goto nla_put_failure;
1323
1324         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1325                 goto nla_put_failure;
1326
1327         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1328             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1329             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1330                 goto nla_put_failure;
1331
1332         if (gnet_stats_finish_copy(&d) < 0)
1333                 goto nla_put_failure;
1334
1335         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1336         return skb->len;
1337
1338 out_nlmsg_trim:
1339 nla_put_failure:
1340         nlmsg_trim(skb, b);
1341         return -1;
1342 }
1343
1344 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1345 {
1346         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1347 }
1348
1349 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1350                         struct nlmsghdr *n, u32 clid,
1351                         struct Qdisc *old, struct Qdisc *new)
1352 {
1353         struct sk_buff *skb;
1354         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1355
1356         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1357         if (!skb)
1358                 return -ENOBUFS;
1359
1360         if (old && !tc_qdisc_dump_ignore(old)) {
1361                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1362                                   0, RTM_DELQDISC) < 0)
1363                         goto err_out;
1364         }
1365         if (new && !tc_qdisc_dump_ignore(new)) {
1366                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1367                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1368                         goto err_out;
1369         }
1370
1371         if (skb->len)
1372                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1373                                       n->nlmsg_flags & NLM_F_ECHO);
1374
1375 err_out:
1376         kfree_skb(skb);
1377         return -EINVAL;
1378 }
1379
1380 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1381                               struct netlink_callback *cb,
1382                               int *q_idx_p, int s_q_idx)
1383 {
1384         int ret = 0, q_idx = *q_idx_p;
1385         struct Qdisc *q;
1386
1387         if (!root)
1388                 return 0;
1389
1390         q = root;
1391         if (q_idx < s_q_idx) {
1392                 q_idx++;
1393         } else {
1394                 if (!tc_qdisc_dump_ignore(q) &&
1395                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1396                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1397                         goto done;
1398                 q_idx++;
1399         }
1400         list_for_each_entry(q, &root->list, list) {
1401                 if (q_idx < s_q_idx) {
1402                         q_idx++;
1403                         continue;
1404                 }
1405                 if (!tc_qdisc_dump_ignore(q) &&
1406                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1407                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1408                         goto done;
1409                 q_idx++;
1410         }
1411
1412 out:
1413         *q_idx_p = q_idx;
1414         return ret;
1415 done:
1416         ret = -1;
1417         goto out;
1418 }
1419
1420 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1421 {
1422         struct net *net = sock_net(skb->sk);
1423         int idx, q_idx;
1424         int s_idx, s_q_idx;
1425         struct net_device *dev;
1426
1427         s_idx = cb->args[0];
1428         s_q_idx = q_idx = cb->args[1];
1429
1430         rcu_read_lock();
1431         idx = 0;
1432         for_each_netdev_rcu(net, dev) {
1433                 struct netdev_queue *dev_queue;
1434
1435                 if (idx < s_idx)
1436                         goto cont;
1437                 if (idx > s_idx)
1438                         s_q_idx = 0;
1439                 q_idx = 0;
1440
1441                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1442                         goto done;
1443
1444                 dev_queue = dev_ingress_queue(dev);
1445                 if (dev_queue &&
1446                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1447                                        &q_idx, s_q_idx) < 0)
1448                         goto done;
1449
1450 cont:
1451                 idx++;
1452         }
1453
1454 done:
1455         rcu_read_unlock();
1456
1457         cb->args[0] = idx;
1458         cb->args[1] = q_idx;
1459
1460         return skb->len;
1461 }
1462
1463
1464
1465 /************************************************
1466  *      Traffic classes manipulation.           *
1467  ************************************************/
1468
1469
1470
1471 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1472 {
1473         struct net *net = sock_net(skb->sk);
1474         struct tcmsg *tcm = nlmsg_data(n);
1475         struct nlattr *tca[TCA_MAX + 1];
1476         struct net_device *dev;
1477         struct Qdisc *q = NULL;
1478         const struct Qdisc_class_ops *cops;
1479         unsigned long cl = 0;
1480         unsigned long new_cl;
1481         u32 portid;
1482         u32 clid;
1483         u32 qid;
1484         int err;
1485
1486         if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1487                 return -EPERM;
1488
1489         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1490         if (err < 0)
1491                 return err;
1492
1493         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1494         if (!dev)
1495                 return -ENODEV;
1496
1497         /*
1498            parent == TC_H_UNSPEC - unspecified parent.
1499            parent == TC_H_ROOT   - class is root, which has no parent.
1500            parent == X:0         - parent is root class.
1501            parent == X:Y         - parent is a node in hierarchy.
1502            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1503
1504            handle == 0:0         - generate handle from kernel pool.
1505            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1506            handle == X:Y         - clear.
1507            handle == X:0         - root class.
1508          */
1509
1510         /* Step 1. Determine qdisc handle X:0 */
1511
1512         portid = tcm->tcm_parent;
1513         clid = tcm->tcm_handle;
1514         qid = TC_H_MAJ(clid);
1515
1516         if (portid != TC_H_ROOT) {
1517                 u32 qid1 = TC_H_MAJ(portid);
1518
1519                 if (qid && qid1) {
1520                         /* If both majors are known, they must be identical. */
1521                         if (qid != qid1)
1522                                 return -EINVAL;
1523                 } else if (qid1) {
1524                         qid = qid1;
1525                 } else if (qid == 0)
1526                         qid = dev->qdisc->handle;
1527
1528                 /* Now qid is genuine qdisc handle consistent
1529                  * both with parent and child.
1530                  *
1531                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1532                  */
1533                 if (portid)
1534                         portid = TC_H_MAKE(qid, portid);
1535         } else {
1536                 if (qid == 0)
1537                         qid = dev->qdisc->handle;
1538         }
1539
1540         /* OK. Locate qdisc */
1541         q = qdisc_lookup(dev, qid);
1542         if (!q)
1543                 return -ENOENT;
1544
1545         /* An check that it supports classes */
1546         cops = q->ops->cl_ops;
1547         if (cops == NULL)
1548                 return -EINVAL;
1549
1550         /* Now try to get class */
1551         if (clid == 0) {
1552                 if (portid == TC_H_ROOT)
1553                         clid = qid;
1554         } else
1555                 clid = TC_H_MAKE(qid, clid);
1556
1557         if (clid)
1558                 cl = cops->get(q, clid);
1559
1560         if (cl == 0) {
1561                 err = -ENOENT;
1562                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1563                     !(n->nlmsg_flags & NLM_F_CREATE))
1564                         goto out;
1565         } else {
1566                 switch (n->nlmsg_type) {
1567                 case RTM_NEWTCLASS:
1568                         err = -EEXIST;
1569                         if (n->nlmsg_flags & NLM_F_EXCL)
1570                                 goto out;
1571                         break;
1572                 case RTM_DELTCLASS:
1573                         err = -EOPNOTSUPP;
1574                         if (cops->delete)
1575                                 err = cops->delete(q, cl);
1576                         if (err == 0)
1577                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1578                         goto out;
1579                 case RTM_GETTCLASS:
1580                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1581                         goto out;
1582                 default:
1583                         err = -EINVAL;
1584                         goto out;
1585                 }
1586         }
1587
1588         new_cl = cl;
1589         err = -EOPNOTSUPP;
1590         if (cops->change)
1591                 err = cops->change(q, clid, portid, tca, &new_cl);
1592         if (err == 0)
1593                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1594
1595 out:
1596         if (cl)
1597                 cops->put(q, cl);
1598
1599         return err;
1600 }
1601
1602
1603 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1604                           unsigned long cl,
1605                           u32 portid, u32 seq, u16 flags, int event)
1606 {
1607         struct tcmsg *tcm;
1608         struct nlmsghdr  *nlh;
1609         unsigned char *b = skb_tail_pointer(skb);
1610         struct gnet_dump d;
1611         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1612
1613         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1614         if (!nlh)
1615                 goto out_nlmsg_trim;
1616         tcm = nlmsg_data(nlh);
1617         tcm->tcm_family = AF_UNSPEC;
1618         tcm->tcm__pad1 = 0;
1619         tcm->tcm__pad2 = 0;
1620         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1621         tcm->tcm_parent = q->handle;
1622         tcm->tcm_handle = q->handle;
1623         tcm->tcm_info = 0;
1624         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1625                 goto nla_put_failure;
1626         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1627                 goto nla_put_failure;
1628
1629         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1630                                          qdisc_root_sleeping_lock(q), &d) < 0)
1631                 goto nla_put_failure;
1632
1633         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1634                 goto nla_put_failure;
1635
1636         if (gnet_stats_finish_copy(&d) < 0)
1637                 goto nla_put_failure;
1638
1639         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1640         return skb->len;
1641
1642 out_nlmsg_trim:
1643 nla_put_failure:
1644         nlmsg_trim(skb, b);
1645         return -1;
1646 }
1647
1648 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1649                          struct nlmsghdr *n, struct Qdisc *q,
1650                          unsigned long cl, int event)
1651 {
1652         struct sk_buff *skb;
1653         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1654
1655         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1656         if (!skb)
1657                 return -ENOBUFS;
1658
1659         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1660                 kfree_skb(skb);
1661                 return -EINVAL;
1662         }
1663
1664         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1665                               n->nlmsg_flags & NLM_F_ECHO);
1666 }
1667
1668 struct qdisc_dump_args {
1669         struct qdisc_walker     w;
1670         struct sk_buff          *skb;
1671         struct netlink_callback *cb;
1672 };
1673
1674 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1675 {
1676         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1677
1678         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1679                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1680 }
1681
1682 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1683                                 struct tcmsg *tcm, struct netlink_callback *cb,
1684                                 int *t_p, int s_t)
1685 {
1686         struct qdisc_dump_args arg;
1687
1688         if (tc_qdisc_dump_ignore(q) ||
1689             *t_p < s_t || !q->ops->cl_ops ||
1690             (tcm->tcm_parent &&
1691              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1692                 (*t_p)++;
1693                 return 0;
1694         }
1695         if (*t_p > s_t)
1696                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1697         arg.w.fn = qdisc_class_dump;
1698         arg.skb = skb;
1699         arg.cb = cb;
1700         arg.w.stop  = 0;
1701         arg.w.skip = cb->args[1];
1702         arg.w.count = 0;
1703         q->ops->cl_ops->walk(q, &arg.w);
1704         cb->args[1] = arg.w.count;
1705         if (arg.w.stop)
1706                 return -1;
1707         (*t_p)++;
1708         return 0;
1709 }
1710
1711 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1712                                struct tcmsg *tcm, struct netlink_callback *cb,
1713                                int *t_p, int s_t)
1714 {
1715         struct Qdisc *q;
1716
1717         if (!root)
1718                 return 0;
1719
1720         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1721                 return -1;
1722
1723         list_for_each_entry(q, &root->list, list) {
1724                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1725                         return -1;
1726         }
1727
1728         return 0;
1729 }
1730
1731 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1732 {
1733         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1734         struct net *net = sock_net(skb->sk);
1735         struct netdev_queue *dev_queue;
1736         struct net_device *dev;
1737         int t, s_t;
1738
1739         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1740                 return 0;
1741         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1742         if (!dev)
1743                 return 0;
1744
1745         s_t = cb->args[0];
1746         t = 0;
1747
1748         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1749                 goto done;
1750
1751         dev_queue = dev_ingress_queue(dev);
1752         if (dev_queue &&
1753             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1754                                 &t, s_t) < 0)
1755                 goto done;
1756
1757 done:
1758         cb->args[0] = t;
1759
1760         dev_put(dev);
1761         return skb->len;
1762 }
1763
1764 /* Main classifier routine: scans classifier chain attached
1765  * to this qdisc, (optionally) tests for protocol and asks
1766  * specific classifiers.
1767  */
1768 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1769                        struct tcf_result *res)
1770 {
1771         __be16 protocol = skb->protocol;
1772         int err;
1773
1774         for (; tp; tp = tp->next) {
1775                 if (tp->protocol != protocol &&
1776                     tp->protocol != htons(ETH_P_ALL))
1777                         continue;
1778                 err = tp->classify(skb, tp, res);
1779
1780                 if (err >= 0) {
1781 #ifdef CONFIG_NET_CLS_ACT
1782                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1783                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1784 #endif
1785                         return err;
1786                 }
1787         }
1788         return -1;
1789 }
1790 EXPORT_SYMBOL(tc_classify_compat);
1791
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is a plain call to tc_classify_compat().
 *
 * With CONFIG_NET_CLS_ACT, a TC_ACT_RECLASSIFY result restarts the
 * classification from the head of the chain.  The number of restarts is
 * tracked in the value G_TC_VERD() extracts from skb->tc_verd: once that
 * value has reached MAX_REC_LOOP a rate-limited notice is logged and
 * TC_ACT_SHOT is returned; otherwise the incremented value is stored back
 * with SET_TC_VERD() and the chain is walked again.
 *
 * Returns the result of tc_classify_compat(), or TC_ACT_SHOT when the
 * reclassify limit is hit.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	for (;;) {
		u32 depth;
		int verdict = tc_classify_compat(skb, tp, res);

		if (verdict != TC_ACT_RECLASSIFY)
			return verdict;

		depth = G_TC_VERD(skb->tc_verd);
		if (depth >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}

		/* Record one more pass before starting over from @tp. */
		depth++;
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, depth);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1821
1822 void tcf_destroy(struct tcf_proto *tp)
1823 {
1824         tp->ops->destroy(tp);
1825         module_put(tp->ops->owner);
1826         kfree(tp);
1827 }
1828
1829 void tcf_destroy_chain(struct tcf_proto **fl)
1830 {
1831         struct tcf_proto *tp;
1832
1833         while ((tp = *fl) != NULL) {
1834                 *fl = tp->next;
1835                 tcf_destroy(tp);
1836         }
1837 }
1838 EXPORT_SYMBOL(tcf_destroy_chain);
1839
1840 #ifdef CONFIG_PROC_FS
1841 static int psched_show(struct seq_file *seq, void *v)
1842 {
1843         struct timespec ts;
1844
1845         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1846         seq_printf(seq, "%08x %08x %08x %08x\n",
1847                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1848                    1000000,
1849                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1850
1851         return 0;
1852 }
1853
1854 static int psched_open(struct inode *inode, struct file *file)
1855 {
1856         return single_open(file, psched_show, NULL);
1857 }
1858
1859 static const struct file_operations psched_fops = {
1860         .owner = THIS_MODULE,
1861         .open = psched_open,
1862         .read  = seq_read,
1863         .llseek = seq_lseek,
1864         .release = single_release,
1865 };
1866
1867 static int __net_init psched_net_init(struct net *net)
1868 {
1869         struct proc_dir_entry *e;
1870
1871         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1872         if (e == NULL)
1873                 return -ENOMEM;
1874
1875         return 0;
1876 }
1877
1878 static void __net_exit psched_net_exit(struct net *net)
1879 {
1880         remove_proc_entry("psched", net->proc_net);
1881 }
1882 #else
1883 static int __net_init psched_net_init(struct net *net)
1884 {
1885         return 0;
1886 }
1887
1888 static void __net_exit psched_net_exit(struct net *net)
1889 {
1890 }
1891 #endif
1892
1893 static struct pernet_operations psched_net_ops = {
1894         .init = psched_net_init,
1895         .exit = psched_net_exit,
1896 };
1897
1898 static int __init pktsched_init(void)
1899 {
1900         int err;
1901
1902         err = register_pernet_subsys(&psched_net_ops);
1903         if (err) {
1904                 pr_err("pktsched_init: "
1905                        "cannot initialize per netns operations\n");
1906                 return err;
1907         }
1908
1909         register_qdisc(&pfifo_fast_ops);
1910         register_qdisc(&pfifo_qdisc_ops);
1911         register_qdisc(&bfifo_qdisc_ops);
1912         register_qdisc(&pfifo_head_drop_qdisc_ops);
1913         register_qdisc(&mq_qdisc_ops);
1914
1915         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1916         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1917         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1918         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1919         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1920         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1921
1922         return 0;
1923 }
1924
1925 subsys_initcall(pktsched_init);