/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"
/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can also be loaded from a table to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

	Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
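
/* Usage sketch (editor's illustration, not part of the original file):
 * the module is normally driven from user space via tc, e.g.
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 * which asks for a 100ms mean delay with +/-10ms jitter, where each
 * jitter sample is 25% correlated with the previous one (see the
 * crndstate generator below).
 */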
struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss, ecn, limit, counter, gap;
	u32 duplicate, reorder, corrupt;
	u64 rate;
	s32 packet_overhead, cell_overhead;
	u32 cell_size;
	u32 cell_size_reciprocal;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5;	/* p23 used only in 4-states */
	} clg;
};
/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 */
struct netem_skb_cb {
	psched_time_t	time_to_send;
	ktime_t		tstamp_save;
};

/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
 * to hold a rb_node structure.
 *
 * If struct sk_buff layout is changed, the following checks will complain.
 */
static struct rb_node *netem_rb_node(struct sk_buff *skb)
{
	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
	BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
		     offsetof(struct sk_buff, next) + sizeof(skb->next));
	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
		     offsetof(struct sk_buff, prev) + sizeof(skb->prev));
	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
					      sizeof(skb->prev) +
					      sizeof(skb->tstamp));
	return (struct rb_node *)&skb->next;
}

static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
{
	return (struct sk_buff *)rb;
}

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}
/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = prandom_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return prandom_u32();

	value = prandom_u32();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
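
/* Worked example (editor's illustration): with a requested correlation
 * of 50%, state->rho = 0x80000000, so rho above becomes 2^31 + 1 and
 *
 *	answer ~= (value * 2^31 + last * 2^31) >> 32
 *	       ~= (value + last) / 2
 *
 * i.e. each output is pulled halfway toward the previous one, giving
 * the requested correlation in pure integer arithmetic.
 */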
/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = prandom_u32();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   1 => successfully transmitted packets within a gap period
	 *   4 => isolated losses within a gap period
	 *   3 => lost packets within a burst period
	 *   2 => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case 1:
		if (rnd < clg->a4) {
			clg->state = 4;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = 3;
			return true;
		} else if (clg->a1 + clg->a4 < rnd)
			clg->state = 1;

		break;
	case 2:
		if (rnd < clg->a5) {
			clg->state = 3;
			return true;
		} else
			clg->state = 2;

		break;
	case 3:
		if (rnd < clg->a3)
			clg->state = 2;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = 1;
			return true;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = 3;
			return true;
		}

		break;
	case 4:
		clg->state = 1;
		break;
	}

	return false;
}
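
/* Transition sketch (editor's illustration): the probabilities are u32
 * fractions of 2^32, so e.g. p13 = 1% is stored as a1 ~= 0x028f5c29.
 * From state 1 (good, gap period) a single uniform draw rnd selects:
 *
 *	[0, a4)			-> state 4, isolated loss
 *	[a4, a4 + a1)		-> state 3, start of a loss burst
 *	[a4 + a1, 2^32)		-> stay in state 1, transmit
 */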
/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case 1:
		if (prandom_u32() < clg->a1)
			clg->state = 2;
		if (prandom_u32() < clg->a4)
			return true;
		break;
	case 2:
		if (prandom_u32() < clg->a2)
			clg->state = 1;
		if (prandom_u32() > clg->a3)
			return true;
	}

	return false;
}
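
/* Parameter sketch (editor's illustration): in this mapping a1 = p
 * (good to bad transition), a2 = r (bad to good), a3 = h (probability
 * of a successful transmission while in the bad state) and a4 = 1-k
 * (loss probability in the good state). With k = 1 (no losses in the
 * good state) this reduces to the Gilbert model; additionally setting
 * h = 0 (every packet in the bad state lost) gives the Simple Gilbert
 * case.
 */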
static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4state loss model algorithm (used also for GI model)
		 * Extracts a value from the markov 4 state loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm
		 * Extracts a value from the Gilbert-Elliot loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
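
/* Worked example (editor's illustration): sigma is split into
 * (sigma / NETEM_DIST_SCALE) * NETEM_DIST_SCALE + (sigma % NETEM_DIST_SCALE)
 * so the table value t (in units of NETEM_DIST_SCALE, where one unit is
 * one sigma) scales both parts without 32-bit overflow. With
 * NETEM_DIST_SCALE = 8192, sigma = 10000 and a table entry t = -4096
 * (i.e. -0.5 sigma), the two terms sum to -5000, giving mu - 5000 as
 * expected for half a deviation below the mean.
 */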
static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
{
	u64 ticks;

	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size) /* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	ticks = (u64)len * NSEC_PER_SEC;

	do_div(ticks, q->rate);
	return PSCHED_NS2TICKS(ticks);
}
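
/* Worked example (editor's illustration): with q->rate = 1000000 bytes
 * per second (8 Mbit/s) and a 1500 byte packet,
 *
 *	ticks = 1500 * NSEC_PER_SEC / 1000000 = 1500000 ns
 *
 * so each such packet adds 1.5 ms of serialization delay on top of the
 * configured latency/jitter before PSCHED_NS2TICKS converts the result
 * to scheduler ticks.
 */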
static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p;

	while ((p = rb_first(&q->t_root))) {
		struct sk_buff *skb = netem_rb_to_skb(p);

		rb_erase(p, &q->t_root);
		skb->next = NULL;
		skb->prev = NULL;
		skb->tstamp.tv64 = 0;
		kfree_skb(skb);
	}
}

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

	while (*p) {
		struct sk_buff *skb;

		parent = *p;
		skb = netem_rb_to_skb(parent);
		if (tnext >= netem_skb_cb(skb)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(netem_rb_node(nskb), parent, p);
	rb_insert_color(netem_rb_node(nskb), &q->t_root);
	sch->q.qlen++;
}
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int count = 1;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			sch->qstats.drops++; /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb)))
			return qdisc_drop(skb, sch);

		skb->data[prandom_u32() % skb_headlen(skb)] ^=
			1<<(prandom_u32() % 8);
	}

	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
		return qdisc_reshape_fail(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();

		if (q->rate) {
			struct sk_buff *last;

			if (!skb_queue_empty(&sch->q))
				last = skb_peek_tail(&sch->q);
			else
				last = netem_rb_to_skb(rb_last(&q->t_root));
			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= netem_skb_cb(last)->time_to_send - now;
				delay = max_t(psched_tdiff_t, 0, delay);
				now = netem_skb_cb(last)->time_to_send;
			}

			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		cb->tstamp_save = skb->tstamp;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&sch->q, skb);
		sch->qstats.requeues++;
	}

	return NET_XMIT_SUCCESS;
}
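
/* Reordering sketch (editor's illustration): with "reorder 25% gap 5",
 * q->gap = 5, so four packets in a row take the delayed tfifo path
 * (q->counter < q->gap - 1); the fifth is then, with 25% probability,
 * queued at the head with time_to_send = now, letting it overtake the
 * still-delayed packets ahead of it.
 */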
static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);

	if (!len) {
		struct rb_node *p = rb_first(&q->t_root);

		if (p) {
			struct sk_buff *skb = netem_rb_to_skb(p);

			rb_erase(p, &q->t_root);
			sch->q.qlen--;
			skb->next = NULL;
			skb->prev = NULL;
			len = qdisc_pkt_len(skb);
			sch->qstats.backlog -= len;
			kfree_skb(skb);
		}
	}
	if (!len && q->qdisc && q->qdisc->ops->drop)
		len = q->qdisc->ops->drop(q->qdisc);
	if (len)
		sch->qstats.drops++;

	return len;
}
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	struct rb_node *p;

	if (qdisc_is_throttled(sch))
		return NULL;

tfifo_dequeue:
	skb = __skb_dequeue(&sch->q);
	if (skb) {
deliver:
		sch->qstats.backlog -= qdisc_pkt_len(skb);
		qdisc_unthrottled(sch);
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	p = rb_first(&q->t_root);
	if (p) {
		psched_time_t time_to_send;

		skb = netem_rb_to_skb(p);

		/* if more time remaining? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (time_to_send <= psched_get_time()) {
			rb_erase(p, &q->t_root);

			sch->q.qlen--;
			skb->next = NULL;
			skb->prev = NULL;
			skb->tstamp = netem_skb_cb(skb)->tstamp_save;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif

			if (q->qdisc) {
				int err = qdisc_enqueue(skb, q->qdisc);

				if (unlikely(err != NET_XMIT_SUCCESS)) {
					if (net_xmit_drop_count(err)) {
						sch->qstats.drops++;
						qdisc_tree_decrease_qlen(sch, 1);
					}
				}
				goto tfifo_dequeue;
			}
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}
		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	if (d) {
		if (is_vmalloc_addr(d))
			vfree(d);
		else
			kfree(d);
	}
}
/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;
	size_t s;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	s = sizeof(struct disttable) + n * sizeof(s16);
	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
	if (!d)
		d = vmalloc(s);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(q->delay_dist, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}
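
/* Table origin (editor's note): user space supplies this table; the
 * iproute2 package ships pre-computed distribution files (normal,
 * pareto, paretonormal) and a maketable tool for experimental data.
 * Each entry is an inverse-CDF sample scaled so that one
 * NETEM_DIST_SCALE unit corresponds to one sigma (see tabledist above).
 */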
static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	q->cell_overhead = r->cell_overhead;
}
static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = 1;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = 1;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
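
/* Message layout (editor's illustration): netem's TCA_OPTIONS payload
 * is a fixed struct tc_netem_qopt followed, for newer options, by
 * ordinary nested attributes:
 *
 *	+----------------------+--------------+--------------+---
 *	| struct tc_netem_qopt | TCA_NETEM_*  | TCA_NETEM_*  | ...
 *	+----------------------+--------------+--------------+---
 *
 * parse_attr() skips the aligned struct and hands the remainder to
 * nla_parse(), which keeps old tc binaries compatible.
 */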
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	sch->limit = qopt->limit;

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(sch, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	q->loss_model = CLG_RANDOM;
	if (tb[TCA_NETEM_LOSS])
		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);

	return ret;
}
static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}
static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}
static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	if (*old) {
		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
		qdisc_reset(*old);
	}
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}
static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};
static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)

MODULE_LICENSE("GPL");