2 * net/sched/sch_netem.c Network emulator
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
16 #include <linux/config.h>
17 #include <linux/module.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/netdevice.h>
23 #include <linux/skbuff.h>
24 #include <linux/rtnetlink.h>
26 #include <net/pkt_sched.h>
28 /* Network Emulation Queuing algorithm.
29 ====================================
 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
	 [2] Luigi Rizzo, "DummyNet for FreeBSD"
35 ----------------------------------------------------------------
37 This started out as a simple way to delay outgoing packets to
38 test TCP but has grown to include most of the functionality
39 of a full blown network emulator like NISTnet. It can delay
40 packets and add random jitter (and correlation). The random
41 distribution can be loaded from a table as well to provide
42 normal, Pareto, or experimental curves. Packet loss,
43 duplication, and reordering can also be emulated.
45 This qdisc does not do classification that can be handled in
46 layering other disciplines. It does not need to do bandwidth
47 control either since that can be handled by using token
48 bucket or other rate control.
50 The simulator is limited by the Linux timer resolution
51 and will create packet bursts on the HZ boundary (1ms).
54 struct netem_sched_data {
56 struct timer_list timer;
70 } delay_cor, loss_cor, dup_cor, reorder_cor;
78 /* Time stamp put into socket buffer control block */
80 psched_time_t time_to_send;
83 /* init_crandom - initialize correlated random number generator
84 * Use entropy source for initial seed.
86 static void init_crandom(struct crndstate *state, unsigned long rho)
89 state->last = net_random();
92 /* get_crandom - correlated random number generator
93 * Next number depends on last value.
94 * rho is scaled to avoid floating point.
96 static unsigned long get_crandom(struct crndstate *state)
101 if (state->rho == 0) /* no correllation */
104 value = net_random();
105 rho = (u64)state->rho + 1;
106 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
107 state->last = answer;
111 /* tabledist - return a pseudo-randomly distributed value with mean mu and
112 * std deviation sigma. Uses table lookup to approximate the desired
113 * distribution, and a uniformly-distributed pseudo-random source.
115 static long tabledist(unsigned long mu, long sigma,
116 struct crndstate *state, const struct disttable *dist)
124 rnd = get_crandom(state);
126 /* default uniform distribution */
128 return (rnd % (2*sigma)) - sigma + mu;
130 t = dist->table[rnd % dist->size];
131 x = (sigma % NETEM_DIST_SCALE) * t;
133 x += NETEM_DIST_SCALE/2;
135 x -= NETEM_DIST_SCALE/2;
137 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
141 * Insert one skb into qdisc.
142 * Note: parent depends on return value to account for queue length.
143 * NET_XMIT_DROP: queue length didn't change.
144 * NET_XMIT_SUCCESS: one skb was queued.
146 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
148 struct netem_sched_data *q = qdisc_priv(sch);
149 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
150 struct sk_buff *skb2;
154 pr_debug("netem_enqueue skb=%p\n", skb);
156 /* Random duplication */
157 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
160 /* Random packet drop 0 => none, ~0 => all */
161 if (q->loss && q->loss >= get_crandom(&q->loss_cor))
167 return NET_XMIT_DROP;
171 * If we need to duplicate packet, then re-insert at top of the
172 * qdisc tree, since parent queuer expects that only one
173 * skb will be queued.
175 if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
176 struct Qdisc *rootq = sch->dev->qdisc;
177 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
180 rootq->enqueue(skb2, rootq);
181 q->duplicate = dupsave;
184 if (q->gap == 0 /* not doing reordering */
185 || q->counter < q->gap /* inside last reordering gap */
186 || q->reorder < get_crandom(&q->reorder_cor)) {
188 psched_tdiff_t delay;
190 delay = tabledist(q->latency, q->jitter,
191 &q->delay_cor, q->delay_dist);
193 PSCHED_GET_TIME(now);
194 PSCHED_TADD2(now, delay, cb->time_to_send);
196 ret = q->qdisc->enqueue(skb, q->qdisc);
199 * Do re-ordering by putting one out of N packets at the front
202 PSCHED_GET_TIME(cb->time_to_send);
204 ret = q->qdisc->ops->requeue(skb, q->qdisc);
207 if (likely(ret == NET_XMIT_SUCCESS)) {
209 sch->bstats.bytes += skb->len;
210 sch->bstats.packets++;
214 pr_debug("netem: enqueue ret %d\n", ret);
218 /* Requeue packets but don't change time stamp */
219 static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
221 struct netem_sched_data *q = qdisc_priv(sch);
224 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
226 sch->qstats.requeues++;
232 static unsigned int netem_drop(struct Qdisc* sch)
234 struct netem_sched_data *q = qdisc_priv(sch);
237 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
244 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
246 struct netem_sched_data *q = qdisc_priv(sch);
249 skb = q->qdisc->dequeue(q->qdisc);
251 const struct netem_skb_cb *cb
252 = (const struct netem_skb_cb *)skb->cb;
255 /* if more time remaining? */
256 PSCHED_GET_TIME(now);
258 if (PSCHED_TLESS(cb->time_to_send, now)) {
259 pr_debug("netem_dequeue: return skb=%p\n", skb);
261 sch->flags &= ~TCQ_F_THROTTLED;
264 psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);
266 if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
269 /* After this qlen is confused */
270 printk(KERN_ERR "netem: queue discpline %s could not requeue\n",
276 mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
277 sch->flags |= TCQ_F_THROTTLED;
284 static void netem_watchdog(unsigned long arg)
286 struct Qdisc *sch = (struct Qdisc *)arg;
288 pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
289 sch->flags &= ~TCQ_F_THROTTLED;
290 netif_schedule(sch->dev);
293 static void netem_reset(struct Qdisc *sch)
295 struct netem_sched_data *q = qdisc_priv(sch);
297 qdisc_reset(q->qdisc);
299 sch->flags &= ~TCQ_F_THROTTLED;
300 del_timer_sync(&q->timer);
303 /* Pass size change message down to embedded FIFO */
304 static int set_fifo_limit(struct Qdisc *q, int limit)
309 /* Hack to avoid sending change message to non-FIFO */
310 if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
313 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
315 rta->rta_type = RTM_NEWQDISC;
316 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
317 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
319 ret = q->ops->change(q, rta);
326 * Distribution data is a variable size payload containing
327 * signed 16 bit values.
329 static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
331 struct netem_sched_data *q = qdisc_priv(sch);
332 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
333 const __s16 *data = RTA_DATA(attr);
340 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
345 for (i = 0; i < n; i++)
346 d->table[i] = data[i];
348 spin_lock_bh(&sch->dev->queue_lock);
349 d = xchg(&q->delay_dist, d);
350 spin_unlock_bh(&sch->dev->queue_lock);
356 static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
358 struct netem_sched_data *q = qdisc_priv(sch);
359 const struct tc_netem_corr *c = RTA_DATA(attr);
361 if (RTA_PAYLOAD(attr) != sizeof(*c))
364 init_crandom(&q->delay_cor, c->delay_corr);
365 init_crandom(&q->loss_cor, c->loss_corr);
366 init_crandom(&q->dup_cor, c->dup_corr);
370 static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
372 struct netem_sched_data *q = qdisc_priv(sch);
373 const struct tc_netem_reorder *r = RTA_DATA(attr);
375 if (RTA_PAYLOAD(attr) != sizeof(*r))
378 q->reorder = r->probability;
379 init_crandom(&q->reorder_cor, r->correlation);
383 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
385 struct netem_sched_data *q = qdisc_priv(sch);
386 struct tc_netem_qopt *qopt;
389 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
392 qopt = RTA_DATA(opt);
393 ret = set_fifo_limit(q->qdisc, qopt->limit);
395 pr_debug("netem: can't set fifo limit\n");
399 q->latency = qopt->latency;
400 q->jitter = qopt->jitter;
401 q->limit = qopt->limit;
404 q->loss = qopt->loss;
405 q->duplicate = qopt->duplicate;
407 /* for compatiablity with earlier versions.
408 * if gap is set, need to assume 100% probablity
412 /* Handle nested options after initial queue options.
413 * Should have put all options in nested format but too late now.
415 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
416 struct rtattr *tb[TCA_NETEM_MAX];
417 if (rtattr_parse(tb, TCA_NETEM_MAX,
418 RTA_DATA(opt) + sizeof(*qopt),
419 RTA_PAYLOAD(opt) - sizeof(*qopt)))
422 if (tb[TCA_NETEM_CORR-1]) {
423 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
428 if (tb[TCA_NETEM_DELAY_DIST-1]) {
429 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
433 if (tb[TCA_NETEM_REORDER-1]) {
434 ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
445 * Special case version of FIFO queue for use by netem.
446 * It queues in order based on timestamps in skb's
448 struct fifo_sched_data {
452 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
454 struct fifo_sched_data *q = qdisc_priv(sch);
455 struct sk_buff_head *list = &sch->q;
456 const struct netem_skb_cb *ncb
457 = (const struct netem_skb_cb *)nskb->cb;
460 if (likely(skb_queue_len(list) < q->limit)) {
461 skb_queue_reverse_walk(list, skb) {
462 const struct netem_skb_cb *cb
463 = (const struct netem_skb_cb *)skb->cb;
465 if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send))
469 __skb_queue_after(list, skb, nskb);
471 sch->qstats.backlog += nskb->len;
472 sch->bstats.bytes += nskb->len;
473 sch->bstats.packets++;
475 return NET_XMIT_SUCCESS;
478 return qdisc_drop(nskb, sch);
481 static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
483 struct fifo_sched_data *q = qdisc_priv(sch);
486 struct tc_fifo_qopt *ctl = RTA_DATA(opt);
487 if (RTA_PAYLOAD(opt) < sizeof(*ctl))
490 q->limit = ctl->limit;
492 q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
497 static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
499 struct fifo_sched_data *q = qdisc_priv(sch);
500 struct tc_fifo_qopt opt = { .limit = q->limit };
502 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
509 static struct Qdisc_ops tfifo_qdisc_ops = {
511 .priv_size = sizeof(struct fifo_sched_data),
512 .enqueue = tfifo_enqueue,
513 .dequeue = qdisc_dequeue_head,
514 .requeue = qdisc_requeue,
515 .drop = qdisc_queue_drop,
517 .reset = qdisc_reset_queue,
518 .change = tfifo_init,
522 static int netem_init(struct Qdisc *sch, struct rtattr *opt)
524 struct netem_sched_data *q = qdisc_priv(sch);
530 init_timer(&q->timer);
531 q->timer.function = netem_watchdog;
532 q->timer.data = (unsigned long) sch;
534 q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
536 pr_debug("netem: qdisc create failed\n");
540 ret = netem_change(sch, opt);
542 pr_debug("netem: change failed\n");
543 qdisc_destroy(q->qdisc);
548 static void netem_destroy(struct Qdisc *sch)
550 struct netem_sched_data *q = qdisc_priv(sch);
552 del_timer_sync(&q->timer);
553 qdisc_destroy(q->qdisc);
554 kfree(q->delay_dist);
557 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
559 const struct netem_sched_data *q = qdisc_priv(sch);
560 unsigned char *b = skb->tail;
561 struct rtattr *rta = (struct rtattr *) b;
562 struct tc_netem_qopt qopt;
563 struct tc_netem_corr cor;
564 struct tc_netem_reorder reorder;
566 qopt.latency = q->latency;
567 qopt.jitter = q->jitter;
568 qopt.limit = q->limit;
571 qopt.duplicate = q->duplicate;
572 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
574 cor.delay_corr = q->delay_cor.rho;
575 cor.loss_corr = q->loss_cor.rho;
576 cor.dup_corr = q->dup_cor.rho;
577 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
579 reorder.probability = q->reorder;
580 reorder.correlation = q->reorder_cor.rho;
581 RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
583 rta->rta_len = skb->tail - b;
588 skb_trim(skb, b - skb->data);
592 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
593 struct sk_buff *skb, struct tcmsg *tcm)
595 struct netem_sched_data *q = qdisc_priv(sch);
597 if (cl != 1) /* only one class */
600 tcm->tcm_handle |= TC_H_MIN(1);
601 tcm->tcm_info = q->qdisc->handle;
606 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
609 struct netem_sched_data *q = qdisc_priv(sch);
615 *old = xchg(&q->qdisc, new);
618 sch_tree_unlock(sch);
623 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
625 struct netem_sched_data *q = qdisc_priv(sch);
629 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
634 static void netem_put(struct Qdisc *sch, unsigned long arg)
638 static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
639 struct rtattr **tca, unsigned long *arg)
644 static int netem_delete(struct Qdisc *sch, unsigned long arg)
649 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
652 if (walker->count >= walker->skip)
653 if (walker->fn(sch, 1, walker) < 0) {
661 static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
666 static struct Qdisc_class_ops netem_class_ops = {
667 .graft = netem_graft,
671 .change = netem_change_class,
672 .delete = netem_delete,
674 .tcf_chain = netem_find_tcf,
675 .dump = netem_dump_class,
678 static struct Qdisc_ops netem_qdisc_ops = {
680 .cl_ops = &netem_class_ops,
681 .priv_size = sizeof(struct netem_sched_data),
682 .enqueue = netem_enqueue,
683 .dequeue = netem_dequeue,
684 .requeue = netem_requeue,
687 .reset = netem_reset,
688 .destroy = netem_destroy,
689 .change = netem_change,
691 .owner = THIS_MODULE,
695 static int __init netem_module_init(void)
697 return register_qdisc(&netem_qdisc_ops);
699 static void __exit netem_module_exit(void)
701 unregister_qdisc(&netem_qdisc_ops);
703 module_init(netem_module_init)
704 module_exit(netem_module_exit)
705 MODULE_LICENSE("GPL");