/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected by
 * the dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue and dequeue are serialized via the top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via the top level device
 *   spinlock dev->ingress_lock.
 * - updates to the tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
        spin_lock_bh(&dev->queue_lock);
        spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock(&dev->ingress_lock);
        spin_unlock_bh(&dev->queue_lock);
}
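
/* Illustrative sketch (not part of the original file): under the rules
 * above, a hypothetical caller changing the qdisc tree holds the rtnl
 * mutex for the tree change itself and wraps the data the fast path
 * reads with the helpers above:
 *
 *      ASSERT_RTNL();
 *      qdisc_lock_tree(dev);
 *      ... swap dev->qdisc / dev->qdisc_ingress ...
 *      qdisc_unlock_tree(dev);
 *
 * dev_init_scheduler() and dev_shutdown() below use qdisc_lock_tree()
 * this way.
 */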

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */
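
/* Illustrative sketch (not part of the original file) of how
 * qdisc_restart() below honours this rule: it never spin-waits on
 * netif_tx_lock while holding dev->queue_lock.  With queue_lock already
 * held by the caller, it only trylocks the driver and drops queue_lock
 * before transmitting:
 *
 *      if (!netif_tx_trylock(dev))
 *              requeue and reschedule instead of spinning;
 *      spin_unlock(&dev->queue_lock);
 *      dev_hard_start_xmit(skb, dev);          (only netif_tx_lock held)
 *      netif_tx_unlock(dev);
 *      spin_lock(&dev->queue_lock);
 */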


/* Kick device.

   Returns:  0  - queue is empty or throttled.
            >0  - queue is not empty.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/
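
/* Illustrative note (not part of the original file): the caller in this
 * file is __qdisc_run() below, which loops on this return value, roughly:
 *
 *      do {
 *              if (!qdisc_restart(dev))
 *                      break;
 *      } while (!netif_queue_stopped(dev));
 */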

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set, it does its own locking
                 * in start_xmit.  There is no need to add the overhead of
                 * locking again; these checks are worth it because even
                 * uncontended locks can be quite expensive.  The driver
                 * can also do a trylock, as done here: if the lock is
                 * contended it should return NETDEV_TX_LOCKED (-1) and
                 * the packet will be requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a dead loop is detected.
                                */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        goto out;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto out;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. the driver is locked
                   1. fastroute is enabled
                   2. the device cannot determine its busy state
                      before the start of transmission (e.g. dialout)
                   3. the device is buggy (ppp)
                 */

requeue:
                if (skb->next)
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 0;
        }

out:
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
        if (unlikely(dev->qdisc == &noop_qdisc))
                goto out;

        do {
                if (!qdisc_restart(dev))
                        break;
        } while (!netif_queue_stopped(dev));

out:
        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             =       "noop",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noop_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             =       "noqueue",
        .priv_size      =       0,
        .enqueue        =       noop_enqueue,
        .dequeue        =       noop_dequeue,
        .requeue        =       noop_requeue,
        .owner          =       THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        =       NULL,
        .dequeue        =       noop_dequeue,
        .flags          =       TCQ_F_BUILTIN,
        .ops            =       &noqueue_qdisc_ops,
        .list           =       LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than the
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3
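
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * helper showing how a packet's priority selects a band.  Only the low
 * four bits of skb->priority matter (TC_PRIO_MAX is 15), and lower band
 * numbers are served first by pfifo_fast_dequeue() below.  With the
 * prio2band[] table above:
 *
 *      TC_PRIO_BESTEFFORT  (0) -> band 1 (normal)
 *      TC_PRIO_BULK        (2) -> band 2 (lowest)
 *      TC_PRIO_INTERACTIVE (6) -> band 0 (highest)
 */
static inline int pfifo_fast_band_example(u32 priority)
{
        /* hypothetical helper, for illustration only */
        return prio2band[priority & TC_PRIO_MAX];
}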

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             =       "pfifo_fast",
        .priv_size      =       PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        =       pfifo_fast_enqueue,
        .dequeue        =       pfifo_fast_dequeue,
        .requeue        =       pfifo_fast_requeue,
        .init           =       pfifo_fast_init,
        .reset          =       pfifo_fast_reset,
        .dump           =       pfifo_fast_dump,
        .owner          =       THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);
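        /*
         * Worked example (illustrative numbers, not from the original
         * file): with QDISC_ALIGNTO == 32, a hypothetical sizeof(*sch) of
         * 136 rounds up to QDISC_ALIGN(136) == 160, so with a priv_size
         * of 48 the allocation is 160 + 48 + 31 == 239 bytes.  Wherever
         * kzalloc() places the buffer, the QDISC_ALIGN() below can then
         * pick a 32-byte aligned start for the Qdisc with the private
         * area still fitting inside the allocation; the offset is
         * remembered in sch->padded and undone by kfree() in
         * __qdisc_destroy().
         */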

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        return ERR_PTR(err);    /* err is already negative (-ENOBUFS) */
}

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
                                 unsigned int parentid)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;
        sch->stats_lock = &dev->queue_lock;
        sch->parent = parentid;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback function used to clean up a qdisc once there
 * are no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);

        module_put(ops->owner);
        dev_put(qdisc->dev);
        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* If no queueing discipline is attached to the device, create a
           default one: pfifo_fast for devices that need queueing, and
           noqueue_qdisc for virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
                                                  TC_H_ROOT);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                dev->qdisc_sleeping = qdisc;
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();

        if (dev->gso_skb) {
                kfree_skb(dev->gso_skb);
                dev->gso_skb = NULL;
        }
}

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);