net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26
  27 #include <net/netlink.h>
  28 #include <net/pkt_sched.h>
  29 #include <net/inet_ecn.h>
  30
  31 #define VERSION "1.3"
  32
  33 /*      Network Emulation Queuing algorithm.
  34         ====================================
  35
  36         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  37                  Network Emulation Tool"
  38                  [2] Luigi Rizzo, DummyNet for FreeBSD
  39
  40          ----------------------------------------------------------------
  41
  42          This started out as a simple way to delay outgoing packets to
  43          test TCP but has grown to include most of the functionality
  44          of a full blown network emulator like NISTnet. It can delay
  45          packets and add random jitter (and correlation). The random
  46          distribution can be loaded from a table as well to provide
  47          normal, Pareto, or experimental curves. Packet loss,
  48          duplication, and reordering can also be emulated.
  49
  50          This qdisc does not do classification that can be handled in
  51          layering other disciplines.  It does not need to do bandwidth
  52          control either since that can be handled by using token
  53          bucket or other rate control.
  54
  55      Correlated Loss Generator models
  56
  57         Added generation of correlated loss according to the
  58         "Gilbert-Elliot" model, a 4-state markov model.
  59
  60         References:
  61         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  62         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  63         and intuitive loss model for packet networks and its implementation
  64         in the Netem module in the Linux kernel", available in [1]
  65
  66         Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  67                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  68 */
  69
  70 struct netem_sched_data {
  71         /* internal t(ime)fifo qdisc uses sch->q and sch->limit */
  72
  73         /* optional qdisc for classful handling (NULL at netem init) */
  74         struct Qdisc    *qdisc;
  75
  76         struct qdisc_watchdog watchdog;
  77
  78         psched_tdiff_t latency;
  79         psched_tdiff_t jitter;
  80
  81         u32 loss;
  82         u32 ecn;
  83         u32 limit;
  84         u32 counter;
  85         u32 gap;
  86         u32 duplicate;
  87         u32 reorder;
  88         u32 corrupt;
  89         u32 rate;
  90         s32 packet_overhead;
  91         u32 cell_size;
  92         u32 cell_size_reciprocal;
  93         s32 cell_overhead;
  94
  95         struct crndstate {
  96                 u32 last;
  97                 u32 rho;
  98         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
  99
 100         struct disttable {
 101                 u32  size;
 102                 s16 table[0];
 103         } *delay_dist;
 104
 105         enum  {
 106                 CLG_RANDOM,
 107                 CLG_4_STATES,
 108                 CLG_GILB_ELL,
 109         } loss_model;
 110
 111         /* Correlated Loss Generation models */
 112         struct clgstate {
 113                 /* state of the Markov chain */
 114                 u8 state;
 115
 116                 /* 4-states and Gilbert-Elliot models */
 117                 u32 a1; /* p13 for 4-states or p for GE */
 118                 u32 a2; /* p31 for 4-states or r for GE */
 119                 u32 a3; /* p32 for 4-states or h for GE */
 120                 u32 a4; /* p14 for 4-states or 1-k for GE */
 121                 u32 a5; /* p23 used only in 4-states */
 122         } clg;
 123
 124 };
 125
/* Time stamp put into socket buffer control block.
 * Only valid when skbs are in our internal t(ime)fifo queue.
 * time_to_send is the psched time at which the packet becomes eligible
 * for dequeue; the t(ime)fifo is kept sorted on it.
 */
struct netem_skb_cb {
        psched_time_t   time_to_send;
};
 132
 133 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 134 {
 135         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 136         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 137 }
 138
 139 /* init_crandom - initialize correlated random number generator
 140  * Use entropy source for initial seed.
 141  */
 142 static void init_crandom(struct crndstate *state, unsigned long rho)
 143 {
 144         state->rho = rho;
 145         state->last = net_random();
 146 }
 147
 148 /* get_crandom - correlated random number generator
 149  * Next number depends on last value.
 150  * rho is scaled to avoid floating point.
 151  */
 152 static u32 get_crandom(struct crndstate *state)
 153 {
 154         u64 value, rho;
 155         unsigned long answer;
 156
 157         if (state->rho == 0)    /* no correlation */
 158                 return net_random();
 159
 160         value = net_random();
 161         rho = (u64)state->rho + 1;
 162         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 163         state->last = answer;
 164         return answer;
 165 }
 166
 167 /* loss_4state - 4-state model loss generator
 168  * Generates losses according to the 4-state Markov chain adopted in
 169  * the GI (General and Intuitive) loss model.
 170  */
 171 static bool loss_4state(struct netem_sched_data *q)
 172 {
 173         struct clgstate *clg = &q->clg;
 174         u32 rnd = net_random();
 175
 176         /*
 177          * Makes a comparison between rnd and the transition
 178          * probabilities outgoing from the current state, then decides the
 179          * next state and if the next packet has to be transmitted or lost.
 180          * The four states correspond to:
 181          *   1 => successfully transmitted packets within a gap period
 182          *   4 => isolated losses within a gap period
 183          *   3 => lost packets within a burst period
 184          *   2 => successfully transmitted packets within a burst period
 185          */
 186         switch (clg->state) {
 187         case 1:
 188                 if (rnd < clg->a4) {
 189                         clg->state = 4;
 190                         return true;
 191                 } else if (clg->a4 < rnd && rnd < clg->a1) {
 192                         clg->state = 3;
 193                         return true;
 194                 } else if (clg->a1 < rnd)
 195                         clg->state = 1;
 196
 197                 break;
 198         case 2:
 199                 if (rnd < clg->a5) {
 200                         clg->state = 3;
 201                         return true;
 202                 } else
 203                         clg->state = 2;
 204
 205                 break;
 206         case 3:
 207                 if (rnd < clg->a3)
 208                         clg->state = 2;
 209                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 210                         clg->state = 1;
 211                         return true;
 212                 } else if (clg->a2 + clg->a3 < rnd) {
 213                         clg->state = 3;
 214                         return true;
 215                 }
 216                 break;
 217         case 4:
 218                 clg->state = 1;
 219                 break;
 220         }
 221
 222         return false;
 223 }
 224
 225 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 226  * Generates losses according to the Gilbert-Elliot loss model or
 227  * its special cases  (Gilbert or Simple Gilbert)
 228  *
 229  * Makes a comparison between random number and the transition
 230  * probabilities outgoing from the current state, then decides the
 231  * next state. A second random number is extracted and the comparison
 232  * with the loss probability of the current state decides if the next
 233  * packet will be transmitted or lost.
 234  */
 235 static bool loss_gilb_ell(struct netem_sched_data *q)
 236 {
 237         struct clgstate *clg = &q->clg;
 238
 239         switch (clg->state) {
 240         case 1:
 241                 if (net_random() < clg->a1)
 242                         clg->state = 2;
 243                 if (net_random() < clg->a4)
 244                         return true;
 245         case 2:
 246                 if (net_random() < clg->a2)
 247                         clg->state = 1;
 248                 if (clg->a3 > net_random())
 249                         return true;
 250         }
 251
 252         return false;
 253 }
 254
 255 static bool loss_event(struct netem_sched_data *q)
 256 {
 257         switch (q->loss_model) {
 258         case CLG_RANDOM:
 259                 /* Random packet drop 0 => none, ~0 => all */
 260                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 261
 262         case CLG_4_STATES:
 263                 /* 4state loss model algorithm (used also for GI model)
 264                 * Extracts a value from the markov 4 state loss generator,
 265                 * if it is 1 drops a packet and if needed writes the event in
 266                 * the kernel logs
 267                 */
 268                 return loss_4state(q);
 269
 270         case CLG_GILB_ELL:
 271                 /* Gilbert-Elliot loss model algorithm
 272                 * Extracts a value from the Gilbert-Elliot loss generator,
 273                 * if it is 1 drops a packet and if needed writes the event in
 274                 * the kernel logs
 275                 */
 276                 return loss_gilb_ell(q);
 277         }
 278
 279         return false;   /* not reached */
 280 }
 281
 282
 283 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 284  * std deviation sigma.  Uses table lookup to approximate the desired
 285  * distribution, and a uniformly-distributed pseudo-random source.
 286  */
 287 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 288                                 struct crndstate *state,
 289                                 const struct disttable *dist)
 290 {
 291         psched_tdiff_t x;
 292         long t;
 293         u32 rnd;
 294
 295         if (sigma == 0)
 296                 return mu;
 297
 298         rnd = get_crandom(state);
 299
 300         /* default uniform distribution */
 301         if (dist == NULL)
 302                 return (rnd % (2*sigma)) - sigma + mu;
 303
 304         t = dist->table[rnd % dist->size];
 305         x = (sigma % NETEM_DIST_SCALE) * t;
 306         if (x >= 0)
 307                 x += NETEM_DIST_SCALE/2;
 308         else
 309                 x -= NETEM_DIST_SCALE/2;
 310
 311         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 312 }
 313
 314 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
 315 {
 316         u64 ticks;
 317
 318         len += q->packet_overhead;
 319
 320         if (q->cell_size) {
 321                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 322
 323                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 324                         cells++;
 325                 len = cells * (q->cell_size + q->cell_overhead);
 326         }
 327
 328         ticks = (u64)len * NSEC_PER_SEC;
 329
 330         do_div(ticks, q->rate);
 331         return PSCHED_NS2TICKS(ticks);
 332 }
 333
 334 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 335 {
 336         struct sk_buff_head *list = &sch->q;
 337         psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 338         struct sk_buff *skb = skb_peek_tail(list);
 339
 340         /* Optimize for add at tail */
 341         if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
 342                 return __skb_queue_tail(list, nskb);
 343
 344         skb_queue_reverse_walk(list, skb) {
 345                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 346                         break;
 347         }
 348
 349         __skb_queue_after(list, skb, nskb);
 350 }
 351
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *      NET_XMIT_DROP: queue length didn't change.
 *      NET_XMIT_SUCCESS: one skb was queued.
 *
 * In order, this applies: random duplication, loss (or ECN marking),
 * optional orphaning, re-injection of the duplicate at the root qdisc,
 * random single-bit corruption, the queue limit check, and finally
 * either a delayed insert into the t(ime)fifo or an immediate insert
 * at its head (reordering).
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        /* We don't fill cb now as skb_unshare() may invalidate it */
        struct netem_skb_cb *cb;
        struct sk_buff *skb2;
        /* number of copies to queue: 0 = lost, 1 = normal, 2 = duplicated */
        int count = 1;

        /* Random duplication */
        if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
                ++count;

        /* Drop packet? */
        if (loss_event(q)) {
                /* with ecn set, a successful CE mark replaces the drop
                 * but is still counted in qstats.drops
                 */
                if (q->ecn && INET_ECN_set_ce(skb))
                        sch->qstats.drops++; /* mark packet */
                else
                        --count;
        }
        if (count == 0) {
                /* lost and not duplicated: free it, but do not report
                 * NET_XMIT_DROP to the caller
                 */
                sch->qstats.drops++;
                kfree_skb(skb);
                return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        }

        /* If a delay is expected, orphan the skb. (orphaning usually takes
         * place at TX completion time, so _before_ the link transit delay)
         * Ideally, this orphaning should be done after the rate limiting
         * module, because this breaks TCP Small Queue, and other mechanisms
         * based on socket sk_wmem_alloc.
         */
        if (q->latency || q->jitter)
                skb_orphan(skb);

        /*
         * If we need to duplicate packet, then re-insert at top of the
         * qdisc tree, since parent queuer expects that only one
         * skb will be queued.
         * A failed skb_clone() silently skips the duplication.
         */
        if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
                struct Qdisc *rootq = qdisc_root(sch);
                u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
                q->duplicate = 0;

                qdisc_enqueue_root(skb2, rootq);
                q->duplicate = dupsave;
        }

        /*
         * Randomized packet corruption.
         * Make copy if needed since we are modifying
         * If packet is going to be hardware checksummed, then
         * do it now in software before we mangle it.
         */
        if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
                if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
                    (skb->ip_summed == CHECKSUM_PARTIAL &&
                     skb_checksum_help(skb)))
                        return qdisc_drop(skb, sch);

                /* flip one random bit of one random byte in the linear
                 * (skb_headlen) part of the packet
                 */
                skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
        }

        /* t(ime)fifo already holds sch->limit packets */
        if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
                return qdisc_reshape_fail(skb, sch);

        sch->qstats.backlog += qdisc_pkt_len(skb);

        /* from here on the skb is final, so its cb can be filled in */
        cb = netem_skb_cb(skb);
        if (q->gap == 0 ||              /* not doing reordering */
            q->counter < q->gap - 1 ||  /* inside last reordering gap */
            q->reorder < get_crandom(&q->reorder_cor)) {
                psched_time_t now;
                psched_tdiff_t delay;

                delay = tabledist(q->latency, q->jitter,
                                  &q->delay_cor, q->delay_dist);

                now = psched_get_time();

                if (q->rate) {
                        struct sk_buff_head *list = &sch->q;

                        if (!skb_queue_empty(list)) {
                                /*
                                 * Last packet in queue is reference point (now),
                                 * calculate this time bonus and subtract
                                 * from delay.
                                 */
                                delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
                                delay = max_t(psched_tdiff_t, 0, delay);
                                now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
                        }

                        /* add the time the packet takes at q->rate */
                        delay += packet_len_2_sched_time(skb->len, q);
                }

                cb->time_to_send = now + delay;
                ++q->counter;
                tfifo_enqueue(skb, sch);
        } else {
                /*
                 * Do re-ordering by putting one out of N packets at the front
                 * of the queue.
                 */
                cb->time_to_send = psched_get_time();
                q->counter = 0;

                __skb_queue_head(&sch->q, skb);
                sch->qstats.requeues++;
        }

        return NET_XMIT_SUCCESS;
}
 472
 473 static unsigned int netem_drop(struct Qdisc *sch)
 474 {
 475         struct netem_sched_data *q = qdisc_priv(sch);
 476         unsigned int len;
 477
 478         len = qdisc_queue_drop(sch);
 479         if (!len && q->qdisc && q->qdisc->ops->drop)
 480             len = q->qdisc->ops->drop(q->qdisc);
 481         if (len)
 482                 sch->qstats.drops++;
 483
 484         return len;
 485 }
 486
/* Hand out the next packet that is due, or NULL if there is none.
 *
 * Due packets are taken from the head of the t(ime)fifo.  Without a
 * child qdisc they are returned directly; with one, every due packet is
 * first moved into the child and the packet returned is whatever the
 * child dequeues.  When the head packet is not due yet and the child has
 * nothing either, the watchdog is armed for the head's send time.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
        struct netem_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        if (qdisc_is_throttled(sch))
                return NULL;

tfifo_dequeue:
        skb = qdisc_peek_head(sch);
        if (skb) {
                const struct netem_skb_cb *cb = netem_skb_cb(skb);

                /* head packet is due once its send time has been reached */
                if (cb->time_to_send <= psched_get_time()) {
                        __skb_unlink(skb, &sch->q);
                        sch->qstats.backlog -= qdisc_pkt_len(skb);

#ifdef CONFIG_NET_CLS_ACT
                        /*
                         * If it's at ingress let's pretend the delay is
                         * from the network (tstamp will be updated).
                         */
                        if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
                                skb->tstamp.tv64 = 0;
#endif

                        if (q->qdisc) {
                                /* move the packet into the child, then
                                 * look at the next t(ime)fifo head
                                 */
                                int err = qdisc_enqueue(skb, q->qdisc);

                                if (unlikely(err != NET_XMIT_SUCCESS)) {
                                        if (net_xmit_drop_count(err)) {
                                                sch->qstats.drops++;
                                                qdisc_tree_decrease_qlen(sch, 1);
                                        }
                                }
                                goto tfifo_dequeue;
                        }
deliver:
                        /* common exit for packets from the t(ime)fifo
                         * and from the child qdisc
                         */
                        qdisc_unthrottled(sch);
                        qdisc_bstats_update(sch, skb);
                        return skb;
                }

                /* head is not due yet: the child may still have a packet */
                if (q->qdisc) {
                        skb = q->qdisc->ops->dequeue(q->qdisc);
                        if (skb)
                                goto deliver;
                }
                /* nothing to send now: wake up when the head becomes due */
                qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
        }

        /* t(ime)fifo empty (or watchdog armed): try the child once more */
        if (q->qdisc) {
                skb = q->qdisc->ops->dequeue(q->qdisc);
                if (skb)
                        goto deliver;
        }
        return NULL;
}
 546
 547 static void netem_reset(struct Qdisc *sch)
 548 {
 549         struct netem_sched_data *q = qdisc_priv(sch);
 550
 551         qdisc_reset_queue(sch);
 552         if (q->qdisc)
 553                 qdisc_reset(q->qdisc);
 554         qdisc_watchdog_cancel(&q->watchdog);
 555 }
 556
 557 static void dist_free(struct disttable *d)
 558 {
 559         if (d) {
 560                 if (is_vmalloc_addr(d))
 561                         vfree(d);
 562                 else
 563                         kfree(d);
 564         }
 565 }
 566
 567 /*
 568  * Distribution data is a variable size payload containing
 569  * signed 16 bit values.
 570  */
 571 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 572 {
 573         struct netem_sched_data *q = qdisc_priv(sch);
 574         size_t n = nla_len(attr)/sizeof(__s16);
 575         const __s16 *data = nla_data(attr);
 576         spinlock_t *root_lock;
 577         struct disttable *d;
 578         int i;
 579         size_t s;
 580
 581         if (n > NETEM_DIST_MAX)
 582                 return -EINVAL;
 583
 584         s = sizeof(struct disttable) + n * sizeof(s16);
 585         d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 586         if (!d)
 587                 d = vmalloc(s);
 588         if (!d)
 589                 return -ENOMEM;
 590
 591         d->size = n;
 592         for (i = 0; i < n; i++)
 593                 d->table[i] = data[i];
 594
 595         root_lock = qdisc_root_sleeping_lock(sch);
 596
 597         spin_lock_bh(root_lock);
 598         swap(q->delay_dist, d);
 599         spin_unlock_bh(root_lock);
 600
 601         dist_free(d);
 602         return 0;
 603 }
 604
 605 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 606 {
 607         struct netem_sched_data *q = qdisc_priv(sch);
 608         const struct tc_netem_corr *c = nla_data(attr);
 609
 610         init_crandom(&q->delay_cor, c->delay_corr);
 611         init_crandom(&q->loss_cor, c->loss_corr);
 612         init_crandom(&q->dup_cor, c->dup_corr);
 613 }
 614
 615 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 616 {
 617         struct netem_sched_data *q = qdisc_priv(sch);
 618         const struct tc_netem_reorder *r = nla_data(attr);
 619
 620         q->reorder = r->probability;
 621         init_crandom(&q->reorder_cor, r->correlation);
 622 }
 623
 624 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 625 {
 626         struct netem_sched_data *q = qdisc_priv(sch);
 627         const struct tc_netem_corrupt *r = nla_data(attr);
 628
 629         q->corrupt = r->probability;
 630         init_crandom(&q->corrupt_cor, r->correlation);
 631 }
 632
 633 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 634 {
 635         struct netem_sched_data *q = qdisc_priv(sch);
 636         const struct tc_netem_rate *r = nla_data(attr);
 637
 638         q->rate = r->rate;
 639         q->packet_overhead = r->packet_overhead;
 640         q->cell_size = r->cell_size;
 641         if (q->cell_size)
 642                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 643         q->cell_overhead = r->cell_overhead;
 644 }
 645
 646 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 647 {
 648         struct netem_sched_data *q = qdisc_priv(sch);
 649         const struct nlattr *la;
 650         int rem;
 651
 652         nla_for_each_nested(la, attr, rem) {
 653                 u16 type = nla_type(la);
 654
 655                 switch(type) {
 656                 case NETEM_LOSS_GI: {
 657                         const struct tc_netem_gimodel *gi = nla_data(la);
 658
 659                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 660                                 pr_info("netem: incorrect gi model size\n");
 661                                 return -EINVAL;
 662                         }
 663
 664                         q->loss_model = CLG_4_STATES;
 665
 666                         q->clg.state = 1;
 667                         q->clg.a1 = gi->p13;
 668                         q->clg.a2 = gi->p31;
 669                         q->clg.a3 = gi->p32;
 670                         q->clg.a4 = gi->p14;
 671                         q->clg.a5 = gi->p23;
 672                         break;
 673                 }
 674
 675                 case NETEM_LOSS_GE: {
 676                         const struct tc_netem_gemodel *ge = nla_data(la);
 677
 678                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 679                                 pr_info("netem: incorrect ge model size\n");
 680                                 return -EINVAL;
 681                         }
 682
 683                         q->loss_model = CLG_GILB_ELL;
 684                         q->clg.state = 1;
 685                         q->clg.a1 = ge->p;
 686                         q->clg.a2 = ge->r;
 687                         q->clg.a3 = ge->h;
 688                         q->clg.a4 = ge->k1;
 689                         break;
 690                 }
 691
 692                 default:
 693                         pr_info("netem: unknown loss type %u\n", type);
 694                         return -EINVAL;
 695                 }
 696         }
 697
 698         return 0;
 699 }
 700
/* Netlink attribute policy handed to parse_attr() by netem_change().
 * Attributes with a fixed struct payload give that struct's size as
 * .len; TCA_NETEM_LOSS is a nested attribute and TCA_NETEM_ECN a u32.
 * There is no entry for TCA_NETEM_DELAY_DIST in this table.
 */
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
        [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
        [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
        [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
        [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
        [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
        [TCA_NETEM_ECN]         = { .type = NLA_U32 },
};
 709
 710 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 711                       const struct nla_policy *policy, int len)
 712 {
 713         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 714
 715         if (nested_len < 0) {
 716                 pr_info("netem: invalid attributes len %d\n", nested_len);
 717                 return -EINVAL;
 718         }
 719
 720         if (nested_len >= nla_attr_size(0))
 721                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 722                                  nested_len, policy);
 723
 724         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 725         return 0;
 726 }
 727
 728 /* Parse netlink message to set options */
 729 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 730 {
 731         struct netem_sched_data *q = qdisc_priv(sch);
 732         struct nlattr *tb[TCA_NETEM_MAX + 1];
 733         struct tc_netem_qopt *qopt;
 734         int ret;
 735
 736         if (opt == NULL)
 737                 return -EINVAL;
 738
 739         qopt = nla_data(opt);
 740         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 741         if (ret < 0)
 742                 return ret;
 743
 744         sch->limit = qopt->limit;
 745
 746         q->latency = qopt->latency;
 747         q->jitter = qopt->jitter;
 748         q->limit = qopt->limit;
 749         q->gap = qopt->gap;
 750         q->counter = 0;
 751         q->loss = qopt->loss;
 752         q->duplicate = qopt->duplicate;
 753
 754         /* for compatibility with earlier versions.
 755          * if gap is set, need to assume 100% probability
 756          */
 757         if (q->gap)
 758                 q->reorder = ~0;
 759
 760         if (tb[TCA_NETEM_CORR])
 761                 get_correlation(sch, tb[TCA_NETEM_CORR]);
 762
 763         if (tb[TCA_NETEM_DELAY_DIST]) {
 764                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 765                 if (ret)
 766                         return ret;
 767         }
 768
 769         if (tb[TCA_NETEM_REORDER])
 770                 get_reorder(sch, tb[TCA_NETEM_REORDER]);
 771
 772         if (tb[TCA_NETEM_CORRUPT])
 773                 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 774
 775         if (tb[TCA_NETEM_RATE])
 776                 get_rate(sch, tb[TCA_NETEM_RATE]);
 777
 778         if (tb[TCA_NETEM_ECN])
 779                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 780
 781         q->loss_model = CLG_RANDOM;
 782         if (tb[TCA_NETEM_LOSS])
 783                 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 784
 785         return ret;
 786 }
 787
 788 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 789 {
 790         struct netem_sched_data *q = qdisc_priv(sch);
 791         int ret;
 792
 793         if (!opt)
 794                 return -EINVAL;
 795
 796         qdisc_watchdog_init(&q->watchdog, sch);
 797
 798         q->loss_model = CLG_RANDOM;
 799         ret = netem_change(sch, opt);
 800         if (ret)
 801                 pr_info("netem: change failed\n");
 802         return ret;
 803 }
 804
 805 static void netem_destroy(struct Qdisc *sch)
 806 {
 807         struct netem_sched_data *q = qdisc_priv(sch);
 808
 809         qdisc_watchdog_cancel(&q->watchdog);
 810         if (q->qdisc)
 811                 qdisc_destroy(q->qdisc);
 812         dist_free(q->delay_dist);
 813 }
 814
 815 static int dump_loss_model(const struct netem_sched_data *q,
 816                            struct sk_buff *skb)
 817 {
 818         struct nlattr *nest;
 819
 820         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 821         if (nest == NULL)
 822                 goto nla_put_failure;
 823
 824         switch (q->loss_model) {
 825         case CLG_RANDOM:
 826                 /* legacy loss model */
 827                 nla_nest_cancel(skb, nest);
 828                 return 0;       /* no data */
 829
 830         case CLG_4_STATES: {
 831                 struct tc_netem_gimodel gi = {
 832                         .p13 = q->clg.a1,
 833                         .p31 = q->clg.a2,
 834                         .p32 = q->clg.a3,
 835                         .p14 = q->clg.a4,
 836                         .p23 = q->clg.a5,
 837                 };
 838
 839                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
 840                         goto nla_put_failure;
 841                 break;
 842         }
 843         case CLG_GILB_ELL: {
 844                 struct tc_netem_gemodel ge = {
 845                         .p = q->clg.a1,
 846                         .r = q->clg.a2,
 847                         .h = q->clg.a3,
 848                         .k1 = q->clg.a4,
 849                 };
 850
 851                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
 852                         goto nla_put_failure;
 853                 break;
 854         }
 855         }
 856
 857         nla_nest_end(skb, nest);
 858         return 0;
 859
 860 nla_put_failure:
 861         nla_nest_cancel(skb, nest);
 862         return -1;
 863 }
 864
 865 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 866 {
 867         const struct netem_sched_data *q = qdisc_priv(sch);
 868         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 869         struct tc_netem_qopt qopt;
 870         struct tc_netem_corr cor;
 871         struct tc_netem_reorder reorder;
 872         struct tc_netem_corrupt corrupt;
 873         struct tc_netem_rate rate;
 874
 875         qopt.latency = q->latency;
 876         qopt.jitter = q->jitter;
 877         qopt.limit = q->limit;
 878         qopt.loss = q->loss;
 879         qopt.gap = q->gap;
 880         qopt.duplicate = q->duplicate;
 881         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 882                 goto nla_put_failure;
 883
 884         cor.delay_corr = q->delay_cor.rho;
 885         cor.loss_corr = q->loss_cor.rho;
 886         cor.dup_corr = q->dup_cor.rho;
 887         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
 888                 goto nla_put_failure;
 889
 890         reorder.probability = q->reorder;
 891         reorder.correlation = q->reorder_cor.rho;
 892         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
 893                 goto nla_put_failure;
 894
 895         corrupt.probability = q->corrupt;
 896         corrupt.correlation = q->corrupt_cor.rho;
 897         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
 898                 goto nla_put_failure;
 899
 900         rate.rate = q->rate;
 901         rate.packet_overhead = q->packet_overhead;
 902         rate.cell_size = q->cell_size;
 903         rate.cell_overhead = q->cell_overhead;
 904         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
 905                 goto nla_put_failure;
 906
 907         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
 908                 goto nla_put_failure;
 909
 910         if (dump_loss_model(q, skb) != 0)
 911                 goto nla_put_failure;
 912
 913         return nla_nest_end(skb, nla);
 914
 915 nla_put_failure:
 916         nlmsg_trim(skb, nla);
 917         return -1;
 918 }
 919
 920 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 921                           struct sk_buff *skb, struct tcmsg *tcm)
 922 {
 923         struct netem_sched_data *q = qdisc_priv(sch);
 924
 925         if (cl != 1 || !q->qdisc)       /* only one class */
 926                 return -ENOENT;
 927
 928         tcm->tcm_handle |= TC_H_MIN(1);
 929         tcm->tcm_info = q->qdisc->handle;
 930
 931         return 0;
 932 }
 933
 934 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 935                      struct Qdisc **old)
 936 {
 937         struct netem_sched_data *q = qdisc_priv(sch);
 938
 939         sch_tree_lock(sch);
 940         *old = q->qdisc;
 941         q->qdisc = new;
 942         if (*old) {
 943                 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
 944                 qdisc_reset(*old);
 945         }
 946         sch_tree_unlock(sch);
 947
 948         return 0;
 949 }
 950
 951 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
 952 {
 953         struct netem_sched_data *q = qdisc_priv(sch);
 954         return q->qdisc;
 955 }
 956
 957 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
 958 {
 959         return 1;
 960 }
 961
/*
 * Counterpart of netem_get().  Intentionally empty: netem_get() only
 * returns a constant, so there is nothing to release here.
 */
static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}
 965
 966 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 967 {
 968         if (!walker->stop) {
 969                 if (walker->count >= walker->skip)
 970                         if (walker->fn(sch, 1, walker) < 0) {
 971                                 walker->stop = 1;
 972                                 return;
 973                         }
 974                 walker->count++;
 975         }
 976 }
 977
 978 static const struct Qdisc_class_ops netem_class_ops = {
 979         .graft          =       netem_graft,
 980         .leaf           =       netem_leaf,
 981         .get            =       netem_get,
 982         .put            =       netem_put,
 983         .walk           =       netem_walk,
 984         .dump           =       netem_dump_class,
 985 };
 986
 987 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
 988         .id             =       "netem",
 989         .cl_ops         =       &netem_class_ops,
 990         .priv_size      =       sizeof(struct netem_sched_data),
 991         .enqueue        =       netem_enqueue,
 992         .dequeue        =       netem_dequeue,
 993         .peek           =       qdisc_peek_dequeued,
 994         .drop           =       netem_drop,
 995         .init           =       netem_init,
 996         .reset          =       netem_reset,
 997         .destroy        =       netem_destroy,
 998         .change         =       netem_change,
 999         .dump           =       netem_dump,
1000         .owner          =       THIS_MODULE,
1001 };
1002
1003
1004 static int __init netem_module_init(void)
1005 {
1006         pr_info("netem: version " VERSION "\n");
1007         return register_qdisc(&netem_qdisc_ops);
1008 }
1009 static void __exit netem_module_exit(void)
1010 {
1011         unregister_qdisc(&netem_qdisc_ops);
1012 }
/* Hook the init/exit functions into the module loader and declare the license. */
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");