2 * net/sched/sch_netem.c Network emulator
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
16 #include <linux/module.h>
17 #include <linux/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/errno.h>
21 #include <linux/netdevice.h>
22 #include <linux/skbuff.h>
23 #include <linux/rtnetlink.h>
25 #include <net/pkt_sched.h>
29 /* Network Emulation Queuing algorithm.
30 ====================================
32 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
33 Network Emulation Tool
34 [2] Luigi Rizzo, DummyNet for FreeBSD
36 ----------------------------------------------------------------
38 This started out as a simple way to delay outgoing packets to
39 test TCP but has grown to include most of the functionality
40 of a full blown network emulator like NISTnet. It can delay
41 packets and add random jitter (and correlation). The random
42 distribution can be loaded from a table as well to provide
43 normal, Pareto, or experimental curves. Packet loss,
44 duplication, and reordering can also be emulated.
46 This qdisc does not do classification that can be handled in
47 layering other disciplines. It does not need to do bandwidth
48 control either since that can be handled by using token
49 bucket or other rate control.
51 The simulator is limited by the Linux timer resolution
52 and will create packet bursts on the HZ boundary (1ms).
55 struct netem_sched_data {
57 struct timer_list timer;
72 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
80 /* Time stamp put into socket buffer control block */
82 psched_time_t time_to_send;
85 /* init_crandom - initialize correlated random number generator
86 * Use entropy source for initial seed.
88 static void init_crandom(struct crndstate *state, unsigned long rho)
91 state->last = net_random();
94 /* get_crandom - correlated random number generator
95 * Next number depends on last value.
96 * rho is scaled to avoid floating point.
98 static unsigned long get_crandom(struct crndstate *state)
101 unsigned long answer;
103 if (state->rho == 0) /* no correllation */
106 value = net_random();
107 rho = (u64)state->rho + 1;
108 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
109 state->last = answer;
113 /* tabledist - return a pseudo-randomly distributed value with mean mu and
114 * std deviation sigma. Uses table lookup to approximate the desired
115 * distribution, and a uniformly-distributed pseudo-random source.
117 static long tabledist(unsigned long mu, long sigma,
118 struct crndstate *state, const struct disttable *dist)
126 rnd = get_crandom(state);
128 /* default uniform distribution */
130 return (rnd % (2*sigma)) - sigma + mu;
132 t = dist->table[rnd % dist->size];
133 x = (sigma % NETEM_DIST_SCALE) * t;
135 x += NETEM_DIST_SCALE/2;
137 x -= NETEM_DIST_SCALE/2;
139 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
143 * Insert one skb into qdisc.
144 * Note: parent depends on return value to account for queue length.
145 * NET_XMIT_DROP: queue length didn't change.
146 * NET_XMIT_SUCCESS: one skb was queued.
148 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
150 struct netem_sched_data *q = qdisc_priv(sch);
151 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
152 struct sk_buff *skb2;
156 pr_debug("netem_enqueue skb=%p\n", skb);
158 /* Random duplication */
159 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
162 /* Random packet drop 0 => none, ~0 => all */
163 if (q->loss && q->loss >= get_crandom(&q->loss_cor))
169 return NET_XMIT_BYPASS;
173 * If we need to duplicate packet, then re-insert at top of the
174 * qdisc tree, since parent queuer expects that only one
175 * skb will be queued.
177 if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
178 struct Qdisc *rootq = sch->dev->qdisc;
179 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
182 rootq->enqueue(skb2, rootq);
183 q->duplicate = dupsave;
187 * Randomized packet corruption.
188 * Make copy if needed since we are modifying
189 * If packet is going to be hardware checksummed, then
190 * do it now in software before we mangle it.
192 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
193 if (!(skb = skb_unshare(skb, GFP_ATOMIC))
194 || (skb->ip_summed == CHECKSUM_HW
195 && skb_checksum_help(skb, 0))) {
197 return NET_XMIT_DROP;
200 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
203 if (q->gap == 0 /* not doing reordering */
204 || q->counter < q->gap /* inside last reordering gap */
205 || q->reorder < get_crandom(&q->reorder_cor)) {
207 psched_tdiff_t delay;
209 delay = tabledist(q->latency, q->jitter,
210 &q->delay_cor, q->delay_dist);
212 PSCHED_GET_TIME(now);
213 PSCHED_TADD2(now, delay, cb->time_to_send);
215 ret = q->qdisc->enqueue(skb, q->qdisc);
218 * Do re-ordering by putting one out of N packets at the front
221 PSCHED_GET_TIME(cb->time_to_send);
223 ret = q->qdisc->ops->requeue(skb, q->qdisc);
226 if (likely(ret == NET_XMIT_SUCCESS)) {
228 sch->bstats.bytes += skb->len;
229 sch->bstats.packets++;
233 pr_debug("netem: enqueue ret %d\n", ret);
237 /* Requeue packets but don't change time stamp */
238 static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
240 struct netem_sched_data *q = qdisc_priv(sch);
243 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
245 sch->qstats.requeues++;
251 static unsigned int netem_drop(struct Qdisc* sch)
253 struct netem_sched_data *q = qdisc_priv(sch);
254 unsigned int len = 0;
256 if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
263 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
265 struct netem_sched_data *q = qdisc_priv(sch);
268 skb = q->qdisc->dequeue(q->qdisc);
270 const struct netem_skb_cb *cb
271 = (const struct netem_skb_cb *)skb->cb;
274 /* if more time remaining? */
275 PSCHED_GET_TIME(now);
277 if (PSCHED_TLESS(cb->time_to_send, now)) {
278 pr_debug("netem_dequeue: return skb=%p\n", skb);
280 sch->flags &= ~TCQ_F_THROTTLED;
283 psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);
285 if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
288 /* After this qlen is confused */
289 printk(KERN_ERR "netem: queue discpline %s could not requeue\n",
295 mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
296 sch->flags |= TCQ_F_THROTTLED;
303 static void netem_watchdog(unsigned long arg)
305 struct Qdisc *sch = (struct Qdisc *)arg;
307 pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
308 sch->flags &= ~TCQ_F_THROTTLED;
309 netif_schedule(sch->dev);
312 static void netem_reset(struct Qdisc *sch)
314 struct netem_sched_data *q = qdisc_priv(sch);
316 qdisc_reset(q->qdisc);
318 sch->flags &= ~TCQ_F_THROTTLED;
319 del_timer_sync(&q->timer);
322 /* Pass size change message down to embedded FIFO */
323 static int set_fifo_limit(struct Qdisc *q, int limit)
328 /* Hack to avoid sending change message to non-FIFO */
329 if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
332 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
334 rta->rta_type = RTM_NEWQDISC;
335 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
336 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
338 ret = q->ops->change(q, rta);
345 * Distribution data is a variable size payload containing
346 * signed 16 bit values.
348 static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
350 struct netem_sched_data *q = qdisc_priv(sch);
351 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
352 const __s16 *data = RTA_DATA(attr);
359 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
364 for (i = 0; i < n; i++)
365 d->table[i] = data[i];
367 spin_lock_bh(&sch->dev->queue_lock);
368 d = xchg(&q->delay_dist, d);
369 spin_unlock_bh(&sch->dev->queue_lock);
375 static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
377 struct netem_sched_data *q = qdisc_priv(sch);
378 const struct tc_netem_corr *c = RTA_DATA(attr);
380 if (RTA_PAYLOAD(attr) != sizeof(*c))
383 init_crandom(&q->delay_cor, c->delay_corr);
384 init_crandom(&q->loss_cor, c->loss_corr);
385 init_crandom(&q->dup_cor, c->dup_corr);
389 static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
391 struct netem_sched_data *q = qdisc_priv(sch);
392 const struct tc_netem_reorder *r = RTA_DATA(attr);
394 if (RTA_PAYLOAD(attr) != sizeof(*r))
397 q->reorder = r->probability;
398 init_crandom(&q->reorder_cor, r->correlation);
402 static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
404 struct netem_sched_data *q = qdisc_priv(sch);
405 const struct tc_netem_corrupt *r = RTA_DATA(attr);
407 if (RTA_PAYLOAD(attr) != sizeof(*r))
410 q->corrupt = r->probability;
411 init_crandom(&q->corrupt_cor, r->correlation);
415 /* Parse netlink message to set options */
416 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
418 struct netem_sched_data *q = qdisc_priv(sch);
419 struct tc_netem_qopt *qopt;
422 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
425 qopt = RTA_DATA(opt);
426 ret = set_fifo_limit(q->qdisc, qopt->limit);
428 pr_debug("netem: can't set fifo limit\n");
432 q->latency = qopt->latency;
433 q->jitter = qopt->jitter;
434 q->limit = qopt->limit;
437 q->loss = qopt->loss;
438 q->duplicate = qopt->duplicate;
440 /* for compatiablity with earlier versions.
441 * if gap is set, need to assume 100% probablity
445 /* Handle nested options after initial queue options.
446 * Should have put all options in nested format but too late now.
448 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
449 struct rtattr *tb[TCA_NETEM_MAX];
450 if (rtattr_parse(tb, TCA_NETEM_MAX,
451 RTA_DATA(opt) + sizeof(*qopt),
452 RTA_PAYLOAD(opt) - sizeof(*qopt)))
455 if (tb[TCA_NETEM_CORR-1]) {
456 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
461 if (tb[TCA_NETEM_DELAY_DIST-1]) {
462 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
467 if (tb[TCA_NETEM_REORDER-1]) {
468 ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
473 if (tb[TCA_NETEM_CORRUPT-1]) {
474 ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
484 * Special case version of FIFO queue for use by netem.
485 * It queues in order based on timestamps in skb's
487 struct fifo_sched_data {
491 static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
493 struct fifo_sched_data *q = qdisc_priv(sch);
494 struct sk_buff_head *list = &sch->q;
495 const struct netem_skb_cb *ncb
496 = (const struct netem_skb_cb *)nskb->cb;
499 if (likely(skb_queue_len(list) < q->limit)) {
500 skb_queue_reverse_walk(list, skb) {
501 const struct netem_skb_cb *cb
502 = (const struct netem_skb_cb *)skb->cb;
504 if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
508 __skb_queue_after(list, skb, nskb);
510 sch->qstats.backlog += nskb->len;
511 sch->bstats.bytes += nskb->len;
512 sch->bstats.packets++;
514 return NET_XMIT_SUCCESS;
517 return qdisc_drop(nskb, sch);
520 static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
522 struct fifo_sched_data *q = qdisc_priv(sch);
525 struct tc_fifo_qopt *ctl = RTA_DATA(opt);
526 if (RTA_PAYLOAD(opt) < sizeof(*ctl))
529 q->limit = ctl->limit;
531 q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
536 static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
538 struct fifo_sched_data *q = qdisc_priv(sch);
539 struct tc_fifo_qopt opt = { .limit = q->limit };
541 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
548 static struct Qdisc_ops tfifo_qdisc_ops = {
550 .priv_size = sizeof(struct fifo_sched_data),
551 .enqueue = tfifo_enqueue,
552 .dequeue = qdisc_dequeue_head,
553 .requeue = qdisc_requeue,
554 .drop = qdisc_queue_drop,
556 .reset = qdisc_reset_queue,
557 .change = tfifo_init,
561 static int netem_init(struct Qdisc *sch, struct rtattr *opt)
563 struct netem_sched_data *q = qdisc_priv(sch);
569 init_timer(&q->timer);
570 q->timer.function = netem_watchdog;
571 q->timer.data = (unsigned long) sch;
573 q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
575 pr_debug("netem: qdisc create failed\n");
579 ret = netem_change(sch, opt);
581 pr_debug("netem: change failed\n");
582 qdisc_destroy(q->qdisc);
587 static void netem_destroy(struct Qdisc *sch)
589 struct netem_sched_data *q = qdisc_priv(sch);
591 del_timer_sync(&q->timer);
592 qdisc_destroy(q->qdisc);
593 kfree(q->delay_dist);
596 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
598 const struct netem_sched_data *q = qdisc_priv(sch);
599 unsigned char *b = skb->tail;
600 struct rtattr *rta = (struct rtattr *) b;
601 struct tc_netem_qopt qopt;
602 struct tc_netem_corr cor;
603 struct tc_netem_reorder reorder;
604 struct tc_netem_corrupt corrupt;
606 qopt.latency = q->latency;
607 qopt.jitter = q->jitter;
608 qopt.limit = q->limit;
611 qopt.duplicate = q->duplicate;
612 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
614 cor.delay_corr = q->delay_cor.rho;
615 cor.loss_corr = q->loss_cor.rho;
616 cor.dup_corr = q->dup_cor.rho;
617 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
619 reorder.probability = q->reorder;
620 reorder.correlation = q->reorder_cor.rho;
621 RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
623 corrupt.probability = q->corrupt;
624 corrupt.correlation = q->corrupt_cor.rho;
625 RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
627 rta->rta_len = skb->tail - b;
632 skb_trim(skb, b - skb->data);
636 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
637 struct sk_buff *skb, struct tcmsg *tcm)
639 struct netem_sched_data *q = qdisc_priv(sch);
641 if (cl != 1) /* only one class */
644 tcm->tcm_handle |= TC_H_MIN(1);
645 tcm->tcm_info = q->qdisc->handle;
650 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
653 struct netem_sched_data *q = qdisc_priv(sch);
659 *old = xchg(&q->qdisc, new);
662 sch_tree_unlock(sch);
667 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
669 struct netem_sched_data *q = qdisc_priv(sch);
673 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
/* No per-class refcounting needed. */
static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}
682 static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
683 struct rtattr **tca, unsigned long *arg)
688 static int netem_delete(struct Qdisc *sch, unsigned long arg)
693 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
696 if (walker->count >= walker->skip)
697 if (walker->fn(sch, 1, walker) < 0) {
705 static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
710 static struct Qdisc_class_ops netem_class_ops = {
711 .graft = netem_graft,
715 .change = netem_change_class,
716 .delete = netem_delete,
718 .tcf_chain = netem_find_tcf,
719 .dump = netem_dump_class,
722 static struct Qdisc_ops netem_qdisc_ops = {
724 .cl_ops = &netem_class_ops,
725 .priv_size = sizeof(struct netem_sched_data),
726 .enqueue = netem_enqueue,
727 .dequeue = netem_dequeue,
728 .requeue = netem_requeue,
731 .reset = netem_reset,
732 .destroy = netem_destroy,
733 .change = netem_change,
735 .owner = THIS_MODULE,
739 static int __init netem_module_init(void)
741 pr_info("netem: version " VERSION "\n");
742 return register_qdisc(&netem_qdisc_ops);
744 static void __exit netem_module_exit(void)
746 unregister_qdisc(&netem_qdisc_ops);
748 module_init(netem_module_init)
749 module_exit(netem_module_exit)
750 MODULE_LICENSE("GPL");