net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <linux/types.h>
  14 #include <linux/kernel.h>
  15 #include <linux/jiffies.h>
  16 #include <linux/string.h>
  17 #include <linux/in.h>
  18 #include <linux/errno.h>
  19 #include <linux/init.h>
  20 #include <linux/skbuff.h>
  21 #include <linux/jhash.h>
  22 #include <linux/slab.h>
  23 #include <linux/vmalloc.h>
  24 #include <net/netlink.h>
  25 #include <net/pkt_sched.h>
  26 #include <net/pkt_cls.h>
  27 #include <net/red.h>
  28
  29
  30 /*      Stochastic Fairness Queuing algorithm.
  31         =======================================
  32
  33         Source:
  34         Paul E. McKenney "Stochastic Fairness Queuing",
  35         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37         Paul E. McKenney "Stochastic Fairness Queuing",
  38         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41         See also:
  42         M. Shreedhar and George Varghese "Efficient Fair
  43         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46         This is not the thing that is usually called (W)FQ nowadays.
  47         It does not use any timestamp mechanism, but instead
  48         processes queues in round-robin order.
  49
  50         ADVANTAGE:
  51
  52         - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54         DRAWBACKS:
  55
  56         - "Stochastic" -> It is not 100% fair.
  57         When hash collisions occur, several flows are considered as one.
  58
  59         - "Round-robin" -> It introduces larger delays than virtual clock
  60         based schemes, and should not be used for isolating interactive
  61         traffic from non-interactive. It means, that this scheduler
  62         should be used as leaf of CBQ or P3, which put interactive traffic
  63         to higher priority band.
  64
  65         We still need true WFQ for top level CSZ, but using WFQ
  66         for the best effort traffic is absolutely pointless:
  67         SFQ is superior for this purpose.
  68
  69         IMPLEMENTATION:
  70         This implementation limits :
  71         - maximal queue length per flow to 127 packets.
  72         - max mtu to 2^18-1;
  73         - max 65408 flows,
  74         - number of hash buckets to 65536.
  75
  76         It is easy to increase these values, but not in flight.  */
  77
  78 #define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79 #define SFQ_DEFAULT_FLOWS       128
  80 #define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81 #define SFQ_EMPTY_SLOT          0xffff
  82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
  84 /* We use 16 bits to store allot, and want to handle packets up to 64K
  85  * Scale allot by 8 (1<<3) so that no overflow occurs.
  86  */
  87 #define SFQ_ALLOT_SHIFT         3
  88 #define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
  89
  90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91 typedef u16 sfq_index;
  92
  93 /*
  94  * We dont use pointers to save space.
  95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97  * are 'pointers' to dep[] array
  98  */
  99 struct sfq_head {
 100         sfq_index       next;
 101         sfq_index       prev;
 102 };
 103
 104 struct sfq_slot {
 105         struct sk_buff  *skblist_next;
 106         struct sk_buff  *skblist_prev;
 107         sfq_index       qlen; /* number of skbs in skblist */
 108         sfq_index       next; /* next slot in sfq RR chain */
 109         struct sfq_head dep; /* anchor in dep[] chains */
 110         unsigned short  hash; /* hash value (index in ht[]) */
 111         short           allot; /* credit for this slot */
 112
 113         unsigned int    backlog;
 114         struct red_vars vars;
 115 };
 116
 117 struct sfq_sched_data {
 118 /* frequently used fields */
 119         int             limit;          /* limit of total number of packets in this qdisc */
 120         unsigned int    divisor;        /* number of slots in hash table */
 121         u8              headdrop;
 122         u8              maxdepth;       /* limit of packets per flow */
 123
 124         u32             perturbation;
 125         u8              cur_depth;      /* depth of longest slot */
 126         u8              flags;
 127         unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128         struct tcf_proto __rcu *filter_list;
 129         sfq_index       *ht;            /* Hash table ('divisor' slots) */
 130         struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 131
 132         struct red_parms *red_parms;
 133         struct tc_sfqred_stats stats;
 134         struct sfq_slot *tail;          /* current slot in round */
 135
 136         struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 137                                         /* Linked lists of slots, indexed by depth
 138                                          * dep[0] : list of unused flows
 139                                          * dep[1] : list of flows with 1 packet
 140                                          * dep[X] : list of flows with X packets
 141                                          */
 142
 143         unsigned int    maxflows;       /* number of flows in flows array */
 144         int             perturb_period;
 145         unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 146         struct timer_list perturb_timer;
 147 };
 148
 149 /*
 150  * sfq_head are either in a sfq_slot or in dep[] array
 151  */
 152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 153 {
 154         if (val < SFQ_MAX_FLOWS)
 155                 return &q->slots[val].dep;
 156         return &q->dep[val - SFQ_MAX_FLOWS];
 157 }
 158
 159 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 160                              const struct sk_buff *skb)
 161 {
 162         return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
 163 }
 164
 165 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 166                                  int *qerr)
 167 {
 168         struct sfq_sched_data *q = qdisc_priv(sch);
 169         struct tcf_result res;
 170         struct tcf_proto *fl;
 171         int result;
 172
 173         if (TC_H_MAJ(skb->priority) == sch->handle &&
 174             TC_H_MIN(skb->priority) > 0 &&
 175             TC_H_MIN(skb->priority) <= q->divisor)
 176                 return TC_H_MIN(skb->priority);
 177
 178         fl = rcu_dereference_bh(q->filter_list);
 179         if (!fl)
 180                 return sfq_hash(q, skb) + 1;
 181
 182         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 183         result = tc_classify(skb, fl, &res, false);
 184         if (result >= 0) {
 185 #ifdef CONFIG_NET_CLS_ACT
 186                 switch (result) {
 187                 case TC_ACT_STOLEN:
 188                 case TC_ACT_QUEUED:
 189                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 190                 case TC_ACT_SHOT:
 191                         return 0;
 192                 }
 193 #endif
 194                 if (TC_H_MIN(res.classid) <= q->divisor)
 195                         return TC_H_MIN(res.classid);
 196         }
 197         return 0;
 198 }
 199
 200 /*
 201  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 202  */
 203 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 204 {
 205         sfq_index p, n;
 206         struct sfq_slot *slot = &q->slots[x];
 207         int qlen = slot->qlen;
 208
 209         p = qlen + SFQ_MAX_FLOWS;
 210         n = q->dep[qlen].next;
 211
 212         slot->dep.next = n;
 213         slot->dep.prev = p;
 214
 215         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 216         sfq_dep_head(q, n)->prev = x;
 217 }
 218
 219 #define sfq_unlink(q, x, n, p)                  \
 220         do {                                    \
 221                 n = q->slots[x].dep.next;       \
 222                 p = q->slots[x].dep.prev;       \
 223                 sfq_dep_head(q, p)->next = n;   \
 224                 sfq_dep_head(q, n)->prev = p;   \
 225         } while (0)
 226
 227
 228 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 229 {
 230         sfq_index p, n;
 231         int d;
 232
 233         sfq_unlink(q, x, n, p);
 234
 235         d = q->slots[x].qlen--;
 236         if (n == p && q->cur_depth == d)
 237                 q->cur_depth--;
 238         sfq_link(q, x);
 239 }
 240
 241 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 242 {
 243         sfq_index p, n;
 244         int d;
 245
 246         sfq_unlink(q, x, n, p);
 247
 248         d = ++q->slots[x].qlen;
 249         if (q->cur_depth < d)
 250                 q->cur_depth = d;
 251         sfq_link(q, x);
 252 }
 253
 254 /* helper functions : might be changed when/if skb use a standard list_head */
 255
 256 /* remove one skb from tail of slot queue */
 257 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 258 {
 259         struct sk_buff *skb = slot->skblist_prev;
 260
 261         slot->skblist_prev = skb->prev;
 262         skb->prev->next = (struct sk_buff *)slot;
 263         skb->next = skb->prev = NULL;
 264         return skb;
 265 }
 266
 267 /* remove one skb from head of slot queue */
 268 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 269 {
 270         struct sk_buff *skb = slot->skblist_next;
 271
 272         slot->skblist_next = skb->next;
 273         skb->next->prev = (struct sk_buff *)slot;
 274         skb->next = skb->prev = NULL;
 275         return skb;
 276 }
 277
 278 static inline void slot_queue_init(struct sfq_slot *slot)
 279 {
 280         memset(slot, 0, sizeof(*slot));
 281         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 282 }
 283
 284 /* add skb to slot queue (tail add) */
 285 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 286 {
 287         skb->prev = slot->skblist_prev;
 288         skb->next = (struct sk_buff *)slot;
 289         slot->skblist_prev->next = skb;
 290         slot->skblist_prev = skb;
 291 }
 292
 293 static unsigned int sfq_drop(struct Qdisc *sch)
 294 {
 295         struct sfq_sched_data *q = qdisc_priv(sch);
 296         sfq_index x, d = q->cur_depth;
 297         struct sk_buff *skb;
 298         unsigned int len;
 299         struct sfq_slot *slot;
 300
 301         /* Queue is full! Find the longest slot and drop tail packet from it */
 302         if (d > 1) {
 303                 x = q->dep[d].next;
 304                 slot = &q->slots[x];
 305 drop:
 306                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 307                 len = qdisc_pkt_len(skb);
 308                 slot->backlog -= len;
 309                 sfq_dec(q, x);
 310                 sch->q.qlen--;
 311                 qdisc_qstats_drop(sch);
 312                 qdisc_qstats_backlog_dec(sch, skb);
 313                 kfree_skb(skb);
 314                 return len;
 315         }
 316
 317         if (d == 1) {
 318                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 319                 x = q->tail->next;
 320                 slot = &q->slots[x];
 321                 q->tail->next = slot->next;
 322                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 323                 goto drop;
 324         }
 325
 326         return 0;
 327 }
 328
 329 /* Is ECN parameter configured */
 330 static int sfq_prob_mark(const struct sfq_sched_data *q)
 331 {
 332         return q->flags & TC_RED_ECN;
 333 }
 334
 335 /* Should packets over max threshold just be marked */
 336 static int sfq_hard_mark(const struct sfq_sched_data *q)
 337 {
 338         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 339 }
 340
 341 static int sfq_headdrop(const struct sfq_sched_data *q)
 342 {
 343         return q->headdrop;
 344 }
 345
 346 static int
 347 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
 348 {
 349         struct sfq_sched_data *q = qdisc_priv(sch);
 350         unsigned int hash, dropped;
 351         sfq_index x, qlen;
 352         struct sfq_slot *slot;
 353         int uninitialized_var(ret);
 354         struct sk_buff *head;
 355         int delta;
 356
 357         hash = sfq_classify(skb, sch, &ret);
 358         if (hash == 0) {
 359                 if (ret & __NET_XMIT_BYPASS)
 360                         qdisc_qstats_drop(sch);
 361                 kfree_skb(skb);
 362                 return ret;
 363         }
 364         hash--;
 365
 366         x = q->ht[hash];
 367         slot = &q->slots[x];
 368         if (x == SFQ_EMPTY_SLOT) {
 369                 x = q->dep[0].next; /* get a free slot */
 370                 if (x >= SFQ_MAX_FLOWS)
 371                         return qdisc_drop(skb, sch, to_free);
 372                 q->ht[hash] = x;
 373                 slot = &q->slots[x];
 374                 slot->hash = hash;
 375                 slot->backlog = 0; /* should already be 0 anyway... */
 376                 red_set_vars(&slot->vars);
 377                 goto enqueue;
 378         }
 379         if (q->red_parms) {
 380                 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 381                                                         &slot->vars,
 382                                                         slot->backlog);
 383                 switch (red_action(q->red_parms,
 384                                    &slot->vars,
 385                                    slot->vars.qavg)) {
 386                 case RED_DONT_MARK:
 387                         break;
 388
 389                 case RED_PROB_MARK:
 390                         qdisc_qstats_overlimit(sch);
 391                         if (sfq_prob_mark(q)) {
 392                                 /* We know we have at least one packet in queue */
 393                                 if (sfq_headdrop(q) &&
 394                                     INET_ECN_set_ce(slot->skblist_next)) {
 395                                         q->stats.prob_mark_head++;
 396                                         break;
 397                                 }
 398                                 if (INET_ECN_set_ce(skb)) {
 399                                         q->stats.prob_mark++;
 400                                         break;
 401                                 }
 402                         }
 403                         q->stats.prob_drop++;
 404                         goto congestion_drop;
 405
 406                 case RED_HARD_MARK:
 407                         qdisc_qstats_overlimit(sch);
 408                         if (sfq_hard_mark(q)) {
 409                                 /* We know we have at least one packet in queue */
 410                                 if (sfq_headdrop(q) &&
 411                                     INET_ECN_set_ce(slot->skblist_next)) {
 412                                         q->stats.forced_mark_head++;
 413                                         break;
 414                                 }
 415                                 if (INET_ECN_set_ce(skb)) {
 416                                         q->stats.forced_mark++;
 417                                         break;
 418                                 }
 419                         }
 420                         q->stats.forced_drop++;
 421                         goto congestion_drop;
 422                 }
 423         }
 424
 425         if (slot->qlen >= q->maxdepth) {
 426 congestion_drop:
 427                 if (!sfq_headdrop(q))
 428                         return qdisc_drop(skb, sch, to_free);
 429
 430                 /* We know we have at least one packet in queue */
 431                 head = slot_dequeue_head(slot);
 432                 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 433                 sch->qstats.backlog -= delta;
 434                 slot->backlog -= delta;
 435                 qdisc_drop(head, sch, to_free);
 436
 437                 slot_queue_add(slot, skb);
 438                 return NET_XMIT_CN;
 439         }
 440
 441 enqueue:
 442         qdisc_qstats_backlog_inc(sch, skb);
 443         slot->backlog += qdisc_pkt_len(skb);
 444         slot_queue_add(slot, skb);
 445         sfq_inc(q, x);
 446         if (slot->qlen == 1) {          /* The flow is new */
 447                 if (q->tail == NULL) {  /* It is the first flow */
 448                         slot->next = x;
 449                 } else {
 450                         slot->next = q->tail->next;
 451                         q->tail->next = x;
 452                 }
 453                 /* We put this flow at the end of our flow list.
 454                  * This might sound unfair for a new flow to wait after old ones,
 455                  * but we could endup servicing new flows only, and freeze old ones.
 456                  */
 457                 q->tail = slot;
 458                 /* We could use a bigger initial quantum for new flows */
 459                 slot->allot = q->scaled_quantum;
 460         }
 461         if (++sch->q.qlen <= q->limit)
 462                 return NET_XMIT_SUCCESS;
 463
 464         qlen = slot->qlen;
 465         dropped = sfq_drop(sch);
 466         /* Return Congestion Notification only if we dropped a packet
 467          * from this flow.
 468          */
 469         if (qlen != slot->qlen)
 470                 return NET_XMIT_CN;
 471
 472         /* As we dropped a packet, better let upper stack know this */
 473         qdisc_tree_reduce_backlog(sch, 1, dropped);
 474         return NET_XMIT_SUCCESS;
 475 }
 476
 477 static struct sk_buff *
 478 sfq_dequeue(struct Qdisc *sch)
 479 {
 480         struct sfq_sched_data *q = qdisc_priv(sch);
 481         struct sk_buff *skb;
 482         sfq_index a, next_a;
 483         struct sfq_slot *slot;
 484
 485         /* No active slots */
 486         if (q->tail == NULL)
 487                 return NULL;
 488
 489 next_slot:
 490         a = q->tail->next;
 491         slot = &q->slots[a];
 492         if (slot->allot <= 0) {
 493                 q->tail = slot;
 494                 slot->allot += q->scaled_quantum;
 495                 goto next_slot;
 496         }
 497         skb = slot_dequeue_head(slot);
 498         sfq_dec(q, a);
 499         qdisc_bstats_update(sch, skb);
 500         sch->q.qlen--;
 501         qdisc_qstats_backlog_dec(sch, skb);
 502         slot->backlog -= qdisc_pkt_len(skb);
 503         /* Is the slot empty? */
 504         if (slot->qlen == 0) {
 505                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 506                 next_a = slot->next;
 507                 if (a == next_a) {
 508                         q->tail = NULL; /* no more active slots */
 509                         return skb;
 510                 }
 511                 q->tail->next = next_a;
 512         } else {
 513                 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 514         }
 515         return skb;
 516 }
 517
 518 static void
 519 sfq_reset(struct Qdisc *sch)
 520 {
 521         struct sk_buff *skb;
 522
 523         while ((skb = sfq_dequeue(sch)) != NULL)
 524                 rtnl_kfree_skbs(skb, skb);
 525 }
 526
 527 /*
 528  * When q->perturbation is changed, we rehash all queued skbs
 529  * to avoid OOO (Out Of Order) effects.
 530  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 531  * counters.
 532  */
 533 static void sfq_rehash(struct Qdisc *sch)
 534 {
 535         struct sfq_sched_data *q = qdisc_priv(sch);
 536         struct sk_buff *skb;
 537         int i;
 538         struct sfq_slot *slot;
 539         struct sk_buff_head list;
 540         int dropped = 0;
 541         unsigned int drop_len = 0;
 542
 543         __skb_queue_head_init(&list);
 544
 545         for (i = 0; i < q->maxflows; i++) {
 546                 slot = &q->slots[i];
 547                 if (!slot->qlen)
 548                         continue;
 549                 while (slot->qlen) {
 550                         skb = slot_dequeue_head(slot);
 551                         sfq_dec(q, i);
 552                         __skb_queue_tail(&list, skb);
 553                 }
 554                 slot->backlog = 0;
 555                 red_set_vars(&slot->vars);
 556                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 557         }
 558         q->tail = NULL;
 559
 560         while ((skb = __skb_dequeue(&list)) != NULL) {
 561                 unsigned int hash = sfq_hash(q, skb);
 562                 sfq_index x = q->ht[hash];
 563
 564                 slot = &q->slots[x];
 565                 if (x == SFQ_EMPTY_SLOT) {
 566                         x = q->dep[0].next; /* get a free slot */
 567                         if (x >= SFQ_MAX_FLOWS) {
 568 drop:
 569                                 qdisc_qstats_backlog_dec(sch, skb);
 570                                 drop_len += qdisc_pkt_len(skb);
 571                                 kfree_skb(skb);
 572                                 dropped++;
 573                                 continue;
 574                         }
 575                         q->ht[hash] = x;
 576                         slot = &q->slots[x];
 577                         slot->hash = hash;
 578                 }
 579                 if (slot->qlen >= q->maxdepth)
 580                         goto drop;
 581                 slot_queue_add(slot, skb);
 582                 if (q->red_parms)
 583                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 584                                                         &slot->vars,
 585                                                         slot->backlog);
 586                 slot->backlog += qdisc_pkt_len(skb);
 587                 sfq_inc(q, x);
 588                 if (slot->qlen == 1) {          /* The flow is new */
 589                         if (q->tail == NULL) {  /* It is the first flow */
 590                                 slot->next = x;
 591                         } else {
 592                                 slot->next = q->tail->next;
 593                                 q->tail->next = x;
 594                         }
 595                         q->tail = slot;
 596                         slot->allot = q->scaled_quantum;
 597                 }
 598         }
 599         sch->q.qlen -= dropped;
 600         qdisc_tree_reduce_backlog(sch, dropped, drop_len);
 601 }
 602
 603 static void sfq_perturbation(unsigned long arg)
 604 {
 605         struct Qdisc *sch = (struct Qdisc *)arg;
 606         struct sfq_sched_data *q = qdisc_priv(sch);
 607         spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 608
 609         spin_lock(root_lock);
 610         q->perturbation = prandom_u32();
 611         if (!q->filter_list && q->tail)
 612                 sfq_rehash(sch);
 613         spin_unlock(root_lock);
 614
 615         if (q->perturb_period)
 616                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 617 }
 618
 619 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 620 {
 621         struct sfq_sched_data *q = qdisc_priv(sch);
 622         struct tc_sfq_qopt *ctl = nla_data(opt);
 623         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 624         unsigned int qlen, dropped = 0;
 625         struct red_parms *p = NULL;
 626
 627         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 628                 return -EINVAL;
 629         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 630                 ctl_v1 = nla_data(opt);
 631         if (ctl->divisor &&
 632             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 633                 return -EINVAL;
 634         if (ctl_v1 && ctl_v1->qth_min) {
 635                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 636                 if (!p)
 637                         return -ENOMEM;
 638         }
 639         sch_tree_lock(sch);
 640         if (ctl->quantum) {
 641                 q->quantum = ctl->quantum;
 642                 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 643         }
 644         q->perturb_period = ctl->perturb_period * HZ;
 645         if (ctl->flows)
 646                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 647         if (ctl->divisor) {
 648                 q->divisor = ctl->divisor;
 649                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 650         }
 651         if (ctl_v1) {
 652                 if (ctl_v1->depth)
 653                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 654                 if (p) {
 655                         swap(q->red_parms, p);
 656                         red_set_parms(q->red_parms,
 657                                       ctl_v1->qth_min, ctl_v1->qth_max,
 658                                       ctl_v1->Wlog,
 659                                       ctl_v1->Plog, ctl_v1->Scell_log,
 660                                       NULL,
 661                                       ctl_v1->max_P);
 662                 }
 663                 q->flags = ctl_v1->flags;
 664                 q->headdrop = ctl_v1->headdrop;
 665         }
 666         if (ctl->limit) {
 667                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 668                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 669         }
 670
 671         qlen = sch->q.qlen;
 672         while (sch->q.qlen > q->limit)
 673                 dropped += sfq_drop(sch);
 674         qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
 675
 676         del_timer(&q->perturb_timer);
 677         if (q->perturb_period) {
 678                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 679                 q->perturbation = prandom_u32();
 680         }
 681         sch_tree_unlock(sch);
 682         kfree(p);
 683         return 0;
 684 }
 685
 686 static void *sfq_alloc(size_t sz)
 687 {
 688         void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
 689
 690         if (!ptr)
 691                 ptr = vmalloc(sz);
 692         return ptr;
 693 }
 694
 695 static void sfq_free(void *addr)
 696 {
 697         kvfree(addr);
 698 }
 699
 700 static void sfq_destroy(struct Qdisc *sch)
 701 {
 702         struct sfq_sched_data *q = qdisc_priv(sch);
 703
 704         tcf_destroy_chain(&q->filter_list);
 705         q->perturb_period = 0;
 706         del_timer_sync(&q->perturb_timer);
 707         sfq_free(q->ht);
 708         sfq_free(q->slots);
 709         kfree(q->red_parms);
 710 }
 711
 712 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 713 {
 714         struct sfq_sched_data *q = qdisc_priv(sch);
 715         int i;
 716
 717         setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
 718                                (unsigned long)sch);
 719
 720         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 721                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 722                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 723         }
 724
 725         q->limit = SFQ_MAX_DEPTH;
 726         q->maxdepth = SFQ_MAX_DEPTH;
 727         q->cur_depth = 0;
 728         q->tail = NULL;
 729         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 730         q->maxflows = SFQ_DEFAULT_FLOWS;
 731         q->quantum = psched_mtu(qdisc_dev(sch));
 732         q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 733         q->perturb_period = 0;
 734         q->perturbation = prandom_u32();
 735
 736         if (opt) {
 737                 int err = sfq_change(sch, opt);
 738                 if (err)
 739                         return err;
 740         }
 741
 742         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 743         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 744         if (!q->ht || !q->slots) {
 745                 /* Note: sfq_destroy() will be called by our caller */
 746                 return -ENOMEM;
 747         }
 748
 749         for (i = 0; i < q->divisor; i++)
 750                 q->ht[i] = SFQ_EMPTY_SLOT;
 751
 752         for (i = 0; i < q->maxflows; i++) {
 753                 slot_queue_init(&q->slots[i]);
 754                 sfq_link(q, i);
 755         }
 756         if (q->limit >= 1)
 757                 sch->flags |= TCQ_F_CAN_BYPASS;
 758         else
 759                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 760         return 0;
 761 }
 762
 763 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 764 {
 765         struct sfq_sched_data *q = qdisc_priv(sch);
 766         unsigned char *b = skb_tail_pointer(skb);
 767         struct tc_sfq_qopt_v1 opt;
 768         struct red_parms *p = q->red_parms;
 769
 770         memset(&opt, 0, sizeof(opt));
 771         opt.v0.quantum  = q->quantum;
 772         opt.v0.perturb_period = q->perturb_period / HZ;
 773         opt.v0.limit    = q->limit;
 774         opt.v0.divisor  = q->divisor;
 775         opt.v0.flows    = q->maxflows;
 776         opt.depth       = q->maxdepth;
 777         opt.headdrop    = q->headdrop;
 778
 779         if (p) {
 780                 opt.qth_min     = p->qth_min >> p->Wlog;
 781                 opt.qth_max     = p->qth_max >> p->Wlog;
 782                 opt.Wlog        = p->Wlog;
 783                 opt.Plog        = p->Plog;
 784                 opt.Scell_log   = p->Scell_log;
 785                 opt.max_P       = p->max_P;
 786         }
 787         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 788         opt.flags       = q->flags;
 789
 790         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 791                 goto nla_put_failure;
 792
 793         return skb->len;
 794
 795 nla_put_failure:
 796         nlmsg_trim(skb, b);
 797         return -1;
 798 }
 799
 800 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 801 {
 802         return NULL;
 803 }
 804
 805 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 806 {
 807         return 0;
 808 }
 809
 810 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 811                               u32 classid)
 812 {
 813         /* we cannot bypass queue discipline anymore */
 814         sch->flags &= ~TCQ_F_CAN_BYPASS;
 815         return 0;
 816 }
 817
 818 static void sfq_put(struct Qdisc *q, unsigned long cl)
 819 {
 820 }
 821
 822 static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
 823                                              unsigned long cl)
 824 {
 825         struct sfq_sched_data *q = qdisc_priv(sch);
 826
 827         if (cl)
 828                 return NULL;
 829         return &q->filter_list;
 830 }
 831
 832 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 833                           struct sk_buff *skb, struct tcmsg *tcm)
 834 {
 835         tcm->tcm_handle |= TC_H_MIN(cl);
 836         return 0;
 837 }
 838
 839 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 840                                 struct gnet_dump *d)
 841 {
 842         struct sfq_sched_data *q = qdisc_priv(sch);
 843         sfq_index idx = q->ht[cl - 1];
 844         struct gnet_stats_queue qs = { 0 };
 845         struct tc_sfq_xstats xstats = { 0 };
 846
 847         if (idx != SFQ_EMPTY_SLOT) {
 848                 const struct sfq_slot *slot = &q->slots[idx];
 849
 850                 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 851                 qs.qlen = slot->qlen;
 852                 qs.backlog = slot->backlog;
 853         }
 854         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
 855                 return -1;
 856         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 857 }
 858
 859 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 860 {
 861         struct sfq_sched_data *q = qdisc_priv(sch);
 862         unsigned int i;
 863
 864         if (arg->stop)
 865                 return;
 866
 867         for (i = 0; i < q->divisor; i++) {
 868                 if (q->ht[i] == SFQ_EMPTY_SLOT ||
 869                     arg->count < arg->skip) {
 870                         arg->count++;
 871                         continue;
 872                 }
 873                 if (arg->fn(sch, i + 1, arg) < 0) {
 874                         arg->stop = 1;
 875                         break;
 876                 }
 877                 arg->count++;
 878         }
 879 }
 880
 881 static const struct Qdisc_class_ops sfq_class_ops = {
 882         .leaf           =       sfq_leaf,
 883         .get            =       sfq_get,
 884         .put            =       sfq_put,
 885         .tcf_chain      =       sfq_find_tcf,
 886         .bind_tcf       =       sfq_bind,
 887         .unbind_tcf     =       sfq_put,
 888         .dump           =       sfq_dump_class,
 889         .dump_stats     =       sfq_dump_class_stats,
 890         .walk           =       sfq_walk,
 891 };
 892
 893 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 894         .cl_ops         =       &sfq_class_ops,
 895         .id             =       "sfq",
 896         .priv_size      =       sizeof(struct sfq_sched_data),
 897         .enqueue        =       sfq_enqueue,
 898         .dequeue        =       sfq_dequeue,
 899         .peek           =       qdisc_peek_dequeued,
 900         .init           =       sfq_init,
 901         .reset          =       sfq_reset,
 902         .destroy        =       sfq_destroy,
 903         .change         =       NULL,
 904         .dump           =       sfq_dump,
 905         .owner          =       THIS_MODULE,
 906 };
 907
 908 static int __init sfq_module_init(void)
 909 {
 910         return register_qdisc(&sfq_qdisc_ops);
 911 }
 912 static void __exit sfq_module_exit(void)
 913 {
 914         unregister_qdisc(&sfq_qdisc_ops);
 915 }
 916 module_init(sfq_module_init)
 917 module_exit(sfq_module_exit)
 918 MODULE_LICENSE("GPL");