block/blk-throttle.c

   1 /*
   2  * Interface for controlling IO bandwidth on a request queue
   3  *
   4  * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
   5  */
   6
   7 #include <linux/module.h>
   8 #include <linux/slab.h>
   9 #include <linux/blkdev.h>
  10 #include <linux/bio.h>
  11 #include <linux/blktrace_api.h>
  12 #include "blk-cgroup.h"
  13 #include "blk.h"
  14
  15 /* Max dispatch from a group in 1 round */
  16 static int throtl_grp_quantum = 8;
  17
  18 /* Total max dispatch from all groups in one round */
  19 static int throtl_quantum = 32;
  20
  21 /* Throttling is performed over 100ms slice and after that slice is renewed */
  22 static unsigned long throtl_slice = HZ/10;      /* 100 ms */
  23
  24 static struct blkio_policy_type blkio_policy_throtl;
  25
  26 /* A workqueue to queue throttle related work */
  27 static struct workqueue_struct *kthrotld_workqueue;
  28 static void throtl_schedule_delayed_work(struct throtl_data *td,
  29                                 unsigned long delay);
  30
  31 struct throtl_rb_root {
  32         struct rb_root rb;
  33         struct rb_node *left;
  34         unsigned int count;
  35         unsigned long min_disptime;
  36 };
  37
  38 #define THROTL_RB_ROOT  (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
  39                         .count = 0, .min_disptime = 0}
  40
  41 #define rb_entry_tg(node)       rb_entry((node), struct throtl_grp, rb_node)
  42
  43 struct throtl_grp {
  44         /* List of throtl groups on the request queue*/
  45         struct hlist_node tg_node;
  46
  47         /* active throtl group service_tree member */
  48         struct rb_node rb_node;
  49
  50         /*
  51          * Dispatch time in jiffies. This is the estimated time when group
  52          * will unthrottle and is ready to dispatch more bio. It is used as
  53          * key to sort active groups in service tree.
  54          */
  55         unsigned long disptime;
  56
  57         unsigned int flags;
  58
  59         /* Two lists for READ and WRITE */
  60         struct bio_list bio_lists[2];
  61
  62         /* Number of queued bios on READ and WRITE lists */
  63         unsigned int nr_queued[2];
  64
  65         /* bytes per second rate limits */
  66         uint64_t bps[2];
  67
  68         /* IOPS limits */
  69         unsigned int iops[2];
  70
  71         /* Number of bytes disptached in current slice */
  72         uint64_t bytes_disp[2];
  73         /* Number of bio's dispatched in current slice */
  74         unsigned int io_disp[2];
  75
  76         /* When did we start a new slice */
  77         unsigned long slice_start[2];
  78         unsigned long slice_end[2];
  79
  80         /* Some throttle limits got updated for the group */
  81         int limits_changed;
  82 };
  83
  84 struct throtl_data
  85 {
  86         /* List of throtl groups */
  87         struct hlist_head tg_list;
  88
  89         /* service tree for active throtl groups */
  90         struct throtl_rb_root tg_service_tree;
  91
  92         struct throtl_grp *root_tg;
  93         struct request_queue *queue;
  94
  95         /* Total Number of queued bios on READ and WRITE lists */
  96         unsigned int nr_queued[2];
  97
  98         /*
  99          * number of total undestroyed groups
 100          */
 101         unsigned int nr_undestroyed_grps;
 102
 103         /* Work for dispatching throttled bios */
 104         struct delayed_work throtl_work;
 105
 106         int limits_changed;
 107 };
 108
 109 static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
 110 {
 111         return blkg_to_pdata(blkg, &blkio_policy_throtl);
 112 }
 113
 114 static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
 115 {
 116         return pdata_to_blkg(tg, &blkio_policy_throtl);
 117 }
 118
 119 enum tg_state_flags {
 120         THROTL_TG_FLAG_on_rr = 0,       /* on round-robin busy list */
 121 };
 122
 123 #define THROTL_TG_FNS(name)                                             \
 124 static inline void throtl_mark_tg_##name(struct throtl_grp *tg)         \
 125 {                                                                       \
 126         (tg)->flags |= (1 << THROTL_TG_FLAG_##name);                    \
 127 }                                                                       \
 128 static inline void throtl_clear_tg_##name(struct throtl_grp *tg)        \
 129 {                                                                       \
 130         (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);                   \
 131 }                                                                       \
 132 static inline int throtl_tg_##name(const struct throtl_grp *tg)         \
 133 {                                                                       \
 134         return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;       \
 135 }
 136
 137 THROTL_TG_FNS(on_rr);
 138
 139 #define throtl_log_tg(td, tg, fmt, args...)                             \
 140         blk_add_trace_msg((td)->queue, "throtl %s " fmt,                \
 141                           blkg_path(tg_to_blkg(tg)), ##args);           \
 142
 143 #define throtl_log(td, fmt, args...)    \
 144         blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 145
 146 static inline unsigned int total_nr_queued(struct throtl_data *td)
 147 {
 148         return td->nr_queued[0] + td->nr_queued[1];
 149 }
 150
 151 static void throtl_init_blkio_group(struct blkio_group *blkg)
 152 {
 153         struct throtl_grp *tg = blkg_to_tg(blkg);
 154
 155         INIT_HLIST_NODE(&tg->tg_node);
 156         RB_CLEAR_NODE(&tg->rb_node);
 157         bio_list_init(&tg->bio_lists[0]);
 158         bio_list_init(&tg->bio_lists[1]);
 159         tg->limits_changed = false;
 160
 161         tg->bps[READ] = -1;
 162         tg->bps[WRITE] = -1;
 163         tg->iops[READ] = -1;
 164         tg->iops[WRITE] = -1;
 165 }
 166
 167 static void throtl_link_blkio_group(struct request_queue *q,
 168                                     struct blkio_group *blkg)
 169 {
 170         struct throtl_data *td = q->td;
 171         struct throtl_grp *tg = blkg_to_tg(blkg);
 172
 173         hlist_add_head(&tg->tg_node, &td->tg_list);
 174         td->nr_undestroyed_grps++;
 175 }
 176
 177 static struct
 178 throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 179 {
 180         /*
 181          * This is the common case when there are no blkio cgroups.
 182          * Avoid lookup in this case
 183          */
 184         if (blkcg == &blkio_root_cgroup)
 185                 return td->root_tg;
 186
 187         return blkg_to_tg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
 188 }
 189
 190 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 191                                                   struct blkio_cgroup *blkcg)
 192 {
 193         struct request_queue *q = td->queue;
 194         struct throtl_grp *tg = NULL;
 195
 196         /*
 197          * This is the common case when there are no blkio cgroups.
 198          * Avoid lookup in this case
 199          */
 200         if (blkcg == &blkio_root_cgroup) {
 201                 tg = td->root_tg;
 202         } else {
 203                 struct blkio_group *blkg;
 204
 205                 blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_THROTL, false);
 206
 207                 /* if %NULL and @q is alive, fall back to root_tg */
 208                 if (!IS_ERR(blkg))
 209                         tg = blkg_to_tg(blkg);
 210                 else if (!blk_queue_dead(q))
 211                         tg = td->root_tg;
 212         }
 213
 214         return tg;
 215 }
 216
 217 static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
 218 {
 219         /* Service tree is empty */
 220         if (!root->count)
 221                 return NULL;
 222
 223         if (!root->left)
 224                 root->left = rb_first(&root->rb);
 225
 226         if (root->left)
 227                 return rb_entry_tg(root->left);
 228
 229         return NULL;
 230 }
 231
 232 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 233 {
 234         rb_erase(n, root);
 235         RB_CLEAR_NODE(n);
 236 }
 237
 238 static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
 239 {
 240         if (root->left == n)
 241                 root->left = NULL;
 242         rb_erase_init(n, &root->rb);
 243         --root->count;
 244 }
 245
 246 static void update_min_dispatch_time(struct throtl_rb_root *st)
 247 {
 248         struct throtl_grp *tg;
 249
 250         tg = throtl_rb_first(st);
 251         if (!tg)
 252                 return;
 253
 254         st->min_disptime = tg->disptime;
 255 }
 256
 257 static void
 258 tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
 259 {
 260         struct rb_node **node = &st->rb.rb_node;
 261         struct rb_node *parent = NULL;
 262         struct throtl_grp *__tg;
 263         unsigned long key = tg->disptime;
 264         int left = 1;
 265
 266         while (*node != NULL) {
 267                 parent = *node;
 268                 __tg = rb_entry_tg(parent);
 269
 270                 if (time_before(key, __tg->disptime))
 271                         node = &parent->rb_left;
 272                 else {
 273                         node = &parent->rb_right;
 274                         left = 0;
 275                 }
 276         }
 277
 278         if (left)
 279                 st->left = &tg->rb_node;
 280
 281         rb_link_node(&tg->rb_node, parent, node);
 282         rb_insert_color(&tg->rb_node, &st->rb);
 283 }
 284
 285 static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 286 {
 287         struct throtl_rb_root *st = &td->tg_service_tree;
 288
 289         tg_service_tree_add(st, tg);
 290         throtl_mark_tg_on_rr(tg);
 291         st->count++;
 292 }
 293
 294 static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 295 {
 296         if (!throtl_tg_on_rr(tg))
 297                 __throtl_enqueue_tg(td, tg);
 298 }
 299
 300 static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 301 {
 302         throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
 303         throtl_clear_tg_on_rr(tg);
 304 }
 305
 306 static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 307 {
 308         if (throtl_tg_on_rr(tg))
 309                 __throtl_dequeue_tg(td, tg);
 310 }
 311
 312 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 313 {
 314         struct throtl_rb_root *st = &td->tg_service_tree;
 315
 316         /*
 317          * If there are more bios pending, schedule more work.
 318          */
 319         if (!total_nr_queued(td))
 320                 return;
 321
 322         BUG_ON(!st->count);
 323
 324         update_min_dispatch_time(st);
 325
 326         if (time_before_eq(st->min_disptime, jiffies))
 327                 throtl_schedule_delayed_work(td, 0);
 328         else
 329                 throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
 330 }
 331
 332 static inline void
 333 throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 334 {
 335         tg->bytes_disp[rw] = 0;
 336         tg->io_disp[rw] = 0;
 337         tg->slice_start[rw] = jiffies;
 338         tg->slice_end[rw] = jiffies + throtl_slice;
 339         throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
 340                         rw == READ ? 'R' : 'W', tg->slice_start[rw],
 341                         tg->slice_end[rw], jiffies);
 342 }
 343
 344 static inline void throtl_set_slice_end(struct throtl_data *td,
 345                 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 346 {
 347         tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
 348 }
 349
 350 static inline void throtl_extend_slice(struct throtl_data *td,
 351                 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 352 {
 353         tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
 354         throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
 355                         rw == READ ? 'R' : 'W', tg->slice_start[rw],
 356                         tg->slice_end[rw], jiffies);
 357 }
 358
 359 /* Determine if previously allocated or extended slice is complete or not */
 360 static bool
 361 throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 362 {
 363         if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
 364                 return 0;
 365
 366         return 1;
 367 }
 368
 369 /* Trim the used slices and adjust slice start accordingly */
 370 static inline void
 371 throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 372 {
 373         unsigned long nr_slices, time_elapsed, io_trim;
 374         u64 bytes_trim, tmp;
 375
 376         BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
 377
 378         /*
 379          * If bps are unlimited (-1), then time slice don't get
 380          * renewed. Don't try to trim the slice if slice is used. A new
 381          * slice will start when appropriate.
 382          */
 383         if (throtl_slice_used(td, tg, rw))
 384                 return;
 385
 386         /*
 387          * A bio has been dispatched. Also adjust slice_end. It might happen
 388          * that initially cgroup limit was very low resulting in high
 389          * slice_end, but later limit was bumped up and bio was dispached
 390          * sooner, then we need to reduce slice_end. A high bogus slice_end
 391          * is bad because it does not allow new slice to start.
 392          */
 393
 394         throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
 395
 396         time_elapsed = jiffies - tg->slice_start[rw];
 397
 398         nr_slices = time_elapsed / throtl_slice;
 399
 400         if (!nr_slices)
 401                 return;
 402         tmp = tg->bps[rw] * throtl_slice * nr_slices;
 403         do_div(tmp, HZ);
 404         bytes_trim = tmp;
 405
 406         io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
 407
 408         if (!bytes_trim && !io_trim)
 409                 return;
 410
 411         if (tg->bytes_disp[rw] >= bytes_trim)
 412                 tg->bytes_disp[rw] -= bytes_trim;
 413         else
 414                 tg->bytes_disp[rw] = 0;
 415
 416         if (tg->io_disp[rw] >= io_trim)
 417                 tg->io_disp[rw] -= io_trim;
 418         else
 419                 tg->io_disp[rw] = 0;
 420
 421         tg->slice_start[rw] += nr_slices * throtl_slice;
 422
 423         throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
 424                         " start=%lu end=%lu jiffies=%lu",
 425                         rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
 426                         tg->slice_start[rw], tg->slice_end[rw], jiffies);
 427 }
 428
 429 static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 430                 struct bio *bio, unsigned long *wait)
 431 {
 432         bool rw = bio_data_dir(bio);
 433         unsigned int io_allowed;
 434         unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 435         u64 tmp;
 436
 437         jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 438
 439         /* Slice has just started. Consider one slice interval */
 440         if (!jiffy_elapsed)
 441                 jiffy_elapsed_rnd = throtl_slice;
 442
 443         jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 444
 445         /*
 446          * jiffy_elapsed_rnd should not be a big value as minimum iops can be
 447          * 1 then at max jiffy elapsed should be equivalent of 1 second as we
 448          * will allow dispatch after 1 second and after that slice should
 449          * have been trimmed.
 450          */
 451
 452         tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
 453         do_div(tmp, HZ);
 454
 455         if (tmp > UINT_MAX)
 456                 io_allowed = UINT_MAX;
 457         else
 458                 io_allowed = tmp;
 459
 460         if (tg->io_disp[rw] + 1 <= io_allowed) {
 461                 if (wait)
 462                         *wait = 0;
 463                 return 1;
 464         }
 465
 466         /* Calc approx time to dispatch */
 467         jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
 468
 469         if (jiffy_wait > jiffy_elapsed)
 470                 jiffy_wait = jiffy_wait - jiffy_elapsed;
 471         else
 472                 jiffy_wait = 1;
 473
 474         if (wait)
 475                 *wait = jiffy_wait;
 476         return 0;
 477 }
 478
 479 static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
 480                 struct bio *bio, unsigned long *wait)
 481 {
 482         bool rw = bio_data_dir(bio);
 483         u64 bytes_allowed, extra_bytes, tmp;
 484         unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 485
 486         jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 487
 488         /* Slice has just started. Consider one slice interval */
 489         if (!jiffy_elapsed)
 490                 jiffy_elapsed_rnd = throtl_slice;
 491
 492         jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 493
 494         tmp = tg->bps[rw] * jiffy_elapsed_rnd;
 495         do_div(tmp, HZ);
 496         bytes_allowed = tmp;
 497
 498         if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
 499                 if (wait)
 500                         *wait = 0;
 501                 return 1;
 502         }
 503
 504         /* Calc approx time to dispatch */
 505         extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
 506         jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
 507
 508         if (!jiffy_wait)
 509                 jiffy_wait = 1;
 510
 511         /*
 512          * This wait time is without taking into consideration the rounding
 513          * up we did. Add that time also.
 514          */
 515         jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
 516         if (wait)
 517                 *wait = jiffy_wait;
 518         return 0;
 519 }
 520
 521 static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
 522         if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
 523                 return 1;
 524         return 0;
 525 }
 526
 527 /*
 528  * Returns whether one can dispatch a bio or not. Also returns approx number
 529  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
 530  */
 531 static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 532                                 struct bio *bio, unsigned long *wait)
 533 {
 534         bool rw = bio_data_dir(bio);
 535         unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
 536
 537         /*
 538          * Currently whole state machine of group depends on first bio
 539          * queued in the group bio list. So one should not be calling
 540          * this function with a different bio if there are other bios
 541          * queued.
 542          */
 543         BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
 544
 545         /* If tg->bps = -1, then BW is unlimited */
 546         if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
 547                 if (wait)
 548                         *wait = 0;
 549                 return 1;
 550         }
 551
 552         /*
 553          * If previous slice expired, start a new one otherwise renew/extend
 554          * existing slice to make sure it is at least throtl_slice interval
 555          * long since now.
 556          */
 557         if (throtl_slice_used(td, tg, rw))
 558                 throtl_start_new_slice(td, tg, rw);
 559         else {
 560                 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
 561                         throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
 562         }
 563
 564         if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
 565             && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
 566                 if (wait)
 567                         *wait = 0;
 568                 return 1;
 569         }
 570
 571         max_wait = max(bps_wait, iops_wait);
 572
 573         if (wait)
 574                 *wait = max_wait;
 575
 576         if (time_before(tg->slice_end[rw], jiffies + max_wait))
 577                 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
 578
 579         return 0;
 580 }
 581
 582 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 583 {
 584         bool rw = bio_data_dir(bio);
 585         bool sync = rw_is_sync(bio->bi_rw);
 586
 587         /* Charge the bio to the group */
 588         tg->bytes_disp[rw] += bio->bi_size;
 589         tg->io_disp[rw]++;
 590
 591         blkiocg_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, rw, sync);
 592 }
 593
 594 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
 595                         struct bio *bio)
 596 {
 597         bool rw = bio_data_dir(bio);
 598
 599         bio_list_add(&tg->bio_lists[rw], bio);
 600         /* Take a bio reference on tg */
 601         blkg_get(tg_to_blkg(tg));
 602         tg->nr_queued[rw]++;
 603         td->nr_queued[rw]++;
 604         throtl_enqueue_tg(td, tg);
 605 }
 606
 607 static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
 608 {
 609         unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 610         struct bio *bio;
 611
 612         if ((bio = bio_list_peek(&tg->bio_lists[READ])))
 613                 tg_may_dispatch(td, tg, bio, &read_wait);
 614
 615         if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
 616                 tg_may_dispatch(td, tg, bio, &write_wait);
 617
 618         min_wait = min(read_wait, write_wait);
 619         disptime = jiffies + min_wait;
 620
 621         /* Update dispatch time */
 622         throtl_dequeue_tg(td, tg);
 623         tg->disptime = disptime;
 624         throtl_enqueue_tg(td, tg);
 625 }
 626
 627 static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
 628                                 bool rw, struct bio_list *bl)
 629 {
 630         struct bio *bio;
 631
 632         bio = bio_list_pop(&tg->bio_lists[rw]);
 633         tg->nr_queued[rw]--;
 634         /* Drop bio reference on blkg */
 635         blkg_put(tg_to_blkg(tg));
 636
 637         BUG_ON(td->nr_queued[rw] <= 0);
 638         td->nr_queued[rw]--;
 639
 640         throtl_charge_bio(tg, bio);
 641         bio_list_add(bl, bio);
 642         bio->bi_rw |= REQ_THROTTLED;
 643
 644         throtl_trim_slice(td, tg, rw);
 645 }
 646
 647 static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 648                                 struct bio_list *bl)
 649 {
 650         unsigned int nr_reads = 0, nr_writes = 0;
 651         unsigned int max_nr_reads = throtl_grp_quantum*3/4;
 652         unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
 653         struct bio *bio;
 654
 655         /* Try to dispatch 75% READS and 25% WRITES */
 656
 657         while ((bio = bio_list_peek(&tg->bio_lists[READ]))
 658                 && tg_may_dispatch(td, tg, bio, NULL)) {
 659
 660                 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
 661                 nr_reads++;
 662
 663                 if (nr_reads >= max_nr_reads)
 664                         break;
 665         }
 666
 667         while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
 668                 && tg_may_dispatch(td, tg, bio, NULL)) {
 669
 670                 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
 671                 nr_writes++;
 672
 673                 if (nr_writes >= max_nr_writes)
 674                         break;
 675         }
 676
 677         return nr_reads + nr_writes;
 678 }
 679
 680 static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 681 {
 682         unsigned int nr_disp = 0;
 683         struct throtl_grp *tg;
 684         struct throtl_rb_root *st = &td->tg_service_tree;
 685
 686         while (1) {
 687                 tg = throtl_rb_first(st);
 688
 689                 if (!tg)
 690                         break;
 691
 692                 if (time_before(jiffies, tg->disptime))
 693                         break;
 694
 695                 throtl_dequeue_tg(td, tg);
 696
 697                 nr_disp += throtl_dispatch_tg(td, tg, bl);
 698
 699                 if (tg->nr_queued[0] || tg->nr_queued[1]) {
 700                         tg_update_disptime(td, tg);
 701                         throtl_enqueue_tg(td, tg);
 702                 }
 703
 704                 if (nr_disp >= throtl_quantum)
 705                         break;
 706         }
 707
 708         return nr_disp;
 709 }
 710
 711 static void throtl_process_limit_change(struct throtl_data *td)
 712 {
 713         struct throtl_grp *tg;
 714         struct hlist_node *pos, *n;
 715
 716         if (!td->limits_changed)
 717                 return;
 718
 719         xchg(&td->limits_changed, false);
 720
 721         throtl_log(td, "limits changed");
 722
 723         hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
 724                 if (!tg->limits_changed)
 725                         continue;
 726
 727                 if (!xchg(&tg->limits_changed, false))
 728                         continue;
 729
 730                 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
 731                         " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
 732                         tg->iops[READ], tg->iops[WRITE]);
 733
 734                 /*
 735                  * Restart the slices for both READ and WRITES. It
 736                  * might happen that a group's limit are dropped
 737                  * suddenly and we don't want to account recently
 738                  * dispatched IO with new low rate
 739                  */
 740                 throtl_start_new_slice(td, tg, 0);
 741                 throtl_start_new_slice(td, tg, 1);
 742
 743                 if (throtl_tg_on_rr(tg))
 744                         tg_update_disptime(td, tg);
 745         }
 746 }
 747
 748 /* Dispatch throttled bios. Should be called without queue lock held. */
 749 static int throtl_dispatch(struct request_queue *q)
 750 {
 751         struct throtl_data *td = q->td;
 752         unsigned int nr_disp = 0;
 753         struct bio_list bio_list_on_stack;
 754         struct bio *bio;
 755         struct blk_plug plug;
 756
 757         spin_lock_irq(q->queue_lock);
 758
 759         throtl_process_limit_change(td);
 760
 761         if (!total_nr_queued(td))
 762                 goto out;
 763
 764         bio_list_init(&bio_list_on_stack);
 765
 766         throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
 767                         total_nr_queued(td), td->nr_queued[READ],
 768                         td->nr_queued[WRITE]);
 769
 770         nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
 771
 772         if (nr_disp)
 773                 throtl_log(td, "bios disp=%u", nr_disp);
 774
 775         throtl_schedule_next_dispatch(td);
 776 out:
 777         spin_unlock_irq(q->queue_lock);
 778
 779         /*
 780          * If we dispatched some requests, unplug the queue to make sure
 781          * immediate dispatch
 782          */
 783         if (nr_disp) {
 784                 blk_start_plug(&plug);
 785                 while((bio = bio_list_pop(&bio_list_on_stack)))
 786                         generic_make_request(bio);
 787                 blk_finish_plug(&plug);
 788         }
 789         return nr_disp;
 790 }
 791
 792 void blk_throtl_work(struct work_struct *work)
 793 {
 794         struct throtl_data *td = container_of(work, struct throtl_data,
 795                                         throtl_work.work);
 796         struct request_queue *q = td->queue;
 797
 798         throtl_dispatch(q);
 799 }
 800
 801 /* Call with queue lock held */
 802 static void
 803 throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 804 {
 805
 806         struct delayed_work *dwork = &td->throtl_work;
 807
 808         /* schedule work if limits changed even if no bio is queued */
 809         if (total_nr_queued(td) || td->limits_changed) {
 810                 /*
 811                  * We might have a work scheduled to be executed in future.
 812                  * Cancel that and schedule a new one.
 813                  */
 814                 __cancel_delayed_work(dwork);
 815                 queue_delayed_work(kthrotld_workqueue, dwork, delay);
 816                 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
 817                                 delay, jiffies);
 818         }
 819 }
 820
 821 static void
 822 throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
 823 {
 824         /* Something wrong if we are trying to remove same group twice */
 825         BUG_ON(hlist_unhashed(&tg->tg_node));
 826
 827         hlist_del_init(&tg->tg_node);
 828
 829         /*
 830          * Put the reference taken at the time of creation so that when all
 831          * queues are gone, group can be destroyed.
 832          */
 833         blkg_put(tg_to_blkg(tg));
 834         td->nr_undestroyed_grps--;
 835 }
 836
 837 static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
 838 {
 839         struct hlist_node *pos, *n;
 840         struct throtl_grp *tg;
 841         bool empty = true;
 842
 843         hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
 844                 /* skip root? */
 845                 if (!release_root && tg == td->root_tg)
 846                         continue;
 847
 848                 /*
 849                  * If cgroup removal path got to blk_group first and removed
 850                  * it from cgroup list, then it will take care of destroying
 851                  * cfqg also.
 852                  */
 853                 if (!blkiocg_del_blkio_group(tg_to_blkg(tg)))
 854                         throtl_destroy_tg(td, tg);
 855                 else
 856                         empty = false;
 857         }
 858         return empty;
 859 }
 860
 861 /*
 862  * Blk cgroup controller notification saying that blkio_group object is being
 863  * delinked as associated cgroup object is going away. That also means that
 864  * no new IO will come in this group. So get rid of this group as soon as
 865  * any pending IO in the group is finished.
 866  *
 867  * This function is called under rcu_read_lock(). @q is the rcu protected
 868  * pointer. That means @q is a valid request_queue pointer as long as we
 869  * are rcu read lock.
 870  *
 871  * @q was fetched from blkio_group under blkio_cgroup->lock. That means
 872  * it should not be NULL as even if queue was going away, cgroup deltion
 873  * path got to it first.
 874  */
 875 void throtl_unlink_blkio_group(struct request_queue *q,
 876                                struct blkio_group *blkg)
 877 {
 878         unsigned long flags;
 879
 880         spin_lock_irqsave(q->queue_lock, flags);
 881         throtl_destroy_tg(q->td, blkg_to_tg(blkg));
 882         spin_unlock_irqrestore(q->queue_lock, flags);
 883 }
 884
 885 static bool throtl_clear_queue(struct request_queue *q)
 886 {
 887         lockdep_assert_held(q->queue_lock);
 888
 889         /*
 890          * Clear tgs but leave the root one alone.  This is necessary
 891          * because root_tg is expected to be persistent and safe because
 892          * blk-throtl can never be disabled while @q is alive.  This is a
 893          * kludge to prepare for unified blkg.  This whole function will be
 894          * removed soon.
 895          */
 896         return throtl_release_tgs(q->td, false);
 897 }
 898
 899 static void throtl_update_blkio_group_common(struct throtl_data *td,
 900                                 struct throtl_grp *tg)
 901 {
 902         xchg(&tg->limits_changed, true);
 903         xchg(&td->limits_changed, true);
 904         /* Schedule a work now to process the limit change */
 905         throtl_schedule_delayed_work(td, 0);
 906 }
 907
 908 /*
 909  * For all update functions, @q should be a valid pointer because these
 910  * update functions are called under blkcg_lock, that means, blkg is
 911  * valid and in turn @q is valid. queue exit path can not race because
 912  * of blkcg_lock
 913  *
 914  * Can not take queue lock in update functions as queue lock under blkcg_lock
 915  * is not allowed. Under other paths we take blkcg_lock under queue_lock.
 916  */
 917 static void throtl_update_blkio_group_read_bps(struct request_queue *q,
 918                                 struct blkio_group *blkg, u64 read_bps)
 919 {
 920         struct throtl_grp *tg = blkg_to_tg(blkg);
 921
 922         tg->bps[READ] = read_bps;
 923         throtl_update_blkio_group_common(q->td, tg);
 924 }
 925
 926 static void throtl_update_blkio_group_write_bps(struct request_queue *q,
 927                                 struct blkio_group *blkg, u64 write_bps)
 928 {
 929         struct throtl_grp *tg = blkg_to_tg(blkg);
 930
 931         tg->bps[WRITE] = write_bps;
 932         throtl_update_blkio_group_common(q->td, tg);
 933 }
 934
 935 static void throtl_update_blkio_group_read_iops(struct request_queue *q,
 936                         struct blkio_group *blkg, unsigned int read_iops)
 937 {
 938         struct throtl_grp *tg = blkg_to_tg(blkg);
 939
 940         tg->iops[READ] = read_iops;
 941         throtl_update_blkio_group_common(q->td, tg);
 942 }
 943
 944 static void throtl_update_blkio_group_write_iops(struct request_queue *q,
 945                         struct blkio_group *blkg, unsigned int write_iops)
 946 {
 947         struct throtl_grp *tg = blkg_to_tg(blkg);
 948
 949         tg->iops[WRITE] = write_iops;
 950         throtl_update_blkio_group_common(q->td, tg);
 951 }
 952
 953 static void throtl_shutdown_wq(struct request_queue *q)
 954 {
 955         struct throtl_data *td = q->td;
 956
 957         cancel_delayed_work_sync(&td->throtl_work);
 958 }
 959
 960 static struct blkio_policy_type blkio_policy_throtl = {
 961         .ops = {
 962                 .blkio_init_group_fn = throtl_init_blkio_group,
 963                 .blkio_link_group_fn = throtl_link_blkio_group,
 964                 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
 965                 .blkio_clear_queue_fn = throtl_clear_queue,
 966                 .blkio_update_group_read_bps_fn =
 967                                         throtl_update_blkio_group_read_bps,
 968                 .blkio_update_group_write_bps_fn =
 969                                         throtl_update_blkio_group_write_bps,
 970                 .blkio_update_group_read_iops_fn =
 971                                         throtl_update_blkio_group_read_iops,
 972                 .blkio_update_group_write_iops_fn =
 973                                         throtl_update_blkio_group_write_iops,
 974         },
 975         .plid = BLKIO_POLICY_THROTL,
 976         .pdata_size = sizeof(struct throtl_grp),
 977 };
 978
 979 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 980 {
 981         struct throtl_data *td = q->td;
 982         struct throtl_grp *tg;
 983         bool rw = bio_data_dir(bio), update_disptime = true;
 984         struct blkio_cgroup *blkcg;
 985         bool throttled = false;
 986
 987         if (bio->bi_rw & REQ_THROTTLED) {
 988                 bio->bi_rw &= ~REQ_THROTTLED;
 989                 goto out;
 990         }
 991
 992         /*
 993          * A throtl_grp pointer retrieved under rcu can be used to access
 994          * basic fields like stats and io rates. If a group has no rules,
 995          * just update the dispatch stats in lockless manner and return.
 996          */
 997         rcu_read_lock();
 998         blkcg = task_blkio_cgroup(current);
 999         tg = throtl_lookup_tg(td, blkcg);
1000         if (tg) {
1001                 if (tg_no_rule_group(tg, rw)) {
1002                         blkiocg_update_dispatch_stats(tg_to_blkg(tg),
1003                                                       bio->bi_size, rw,
1004                                                       rw_is_sync(bio->bi_rw));
1005                         goto out_unlock_rcu;
1006                 }
1007         }
1008
1009         /*
1010          * Either group has not been allocated yet or it is not an unlimited
1011          * IO group
1012          */
1013         spin_lock_irq(q->queue_lock);
1014         tg = throtl_lookup_create_tg(td, blkcg);
1015         if (unlikely(!tg))
1016                 goto out_unlock;
1017
1018         if (tg->nr_queued[rw]) {
1019                 /*
1020                  * There is already another bio queued in same dir. No
1021                  * need to update dispatch time.
1022                  */
1023                 update_disptime = false;
1024                 goto queue_bio;
1025
1026         }
1027
1028         /* Bio is with-in rate limit of group */
1029         if (tg_may_dispatch(td, tg, bio, NULL)) {
1030                 throtl_charge_bio(tg, bio);
1031
1032                 /*
1033                  * We need to trim slice even when bios are not being queued
1034                  * otherwise it might happen that a bio is not queued for
1035                  * a long time and slice keeps on extending and trim is not
1036                  * called for a long time. Now if limits are reduced suddenly
1037                  * we take into account all the IO dispatched so far at new
1038                  * low rate and * newly queued IO gets a really long dispatch
1039                  * time.
1040                  *
1041                  * So keep on trimming slice even if bio is not queued.
1042                  */
1043                 throtl_trim_slice(td, tg, rw);
1044                 goto out_unlock;
1045         }
1046
1047 queue_bio:
1048         throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
1049                         " iodisp=%u iops=%u queued=%d/%d",
1050                         rw == READ ? 'R' : 'W',
1051                         tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1052                         tg->io_disp[rw], tg->iops[rw],
1053                         tg->nr_queued[READ], tg->nr_queued[WRITE]);
1054
1055         throtl_add_bio_tg(q->td, tg, bio);
1056         throttled = true;
1057
1058         if (update_disptime) {
1059                 tg_update_disptime(td, tg);
1060                 throtl_schedule_next_dispatch(td);
1061         }
1062
1063 out_unlock:
1064         spin_unlock_irq(q->queue_lock);
1065 out_unlock_rcu:
1066         rcu_read_unlock();
1067 out:
1068         return throttled;
1069 }
1070
1071 /**
1072  * blk_throtl_drain - drain throttled bios
1073  * @q: request_queue to drain throttled bios for
1074  *
1075  * Dispatch all currently throttled bios on @q through ->make_request_fn().
1076  */
1077 void blk_throtl_drain(struct request_queue *q)
1078         __releases(q->queue_lock) __acquires(q->queue_lock)
1079 {
1080         struct throtl_data *td = q->td;
1081         struct throtl_rb_root *st = &td->tg_service_tree;
1082         struct throtl_grp *tg;
1083         struct bio_list bl;
1084         struct bio *bio;
1085
1086         WARN_ON_ONCE(!queue_is_locked(q));
1087
1088         bio_list_init(&bl);
1089
1090         while ((tg = throtl_rb_first(st))) {
1091                 throtl_dequeue_tg(td, tg);
1092
1093                 while ((bio = bio_list_peek(&tg->bio_lists[READ])))
1094                         tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1095                 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1096                         tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1097         }
1098         spin_unlock_irq(q->queue_lock);
1099
1100         while ((bio = bio_list_pop(&bl)))
1101                 generic_make_request(bio);
1102
1103         spin_lock_irq(q->queue_lock);
1104 }
1105
1106 int blk_throtl_init(struct request_queue *q)
1107 {
1108         struct throtl_data *td;
1109         struct blkio_group *blkg;
1110
1111         td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1112         if (!td)
1113                 return -ENOMEM;
1114
1115         INIT_HLIST_HEAD(&td->tg_list);
1116         td->tg_service_tree = THROTL_RB_ROOT;
1117         td->limits_changed = false;
1118         INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1119
1120         q->td = td;
1121         td->queue = q;
1122
1123         /* alloc and init root group. */
1124         rcu_read_lock();
1125         spin_lock_irq(q->queue_lock);
1126
1127         blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL,
1128                                   true);
1129         if (!IS_ERR(blkg))
1130                 td->root_tg = blkg_to_tg(blkg);
1131
1132         spin_unlock_irq(q->queue_lock);
1133         rcu_read_unlock();
1134
1135         if (!td->root_tg) {
1136                 kfree(td);
1137                 return -ENOMEM;
1138         }
1139         return 0;
1140 }
1141
1142 void blk_throtl_exit(struct request_queue *q)
1143 {
1144         struct throtl_data *td = q->td;
1145         bool wait = false;
1146
1147         BUG_ON(!td);
1148
1149         throtl_shutdown_wq(q);
1150
1151         spin_lock_irq(q->queue_lock);
1152         throtl_release_tgs(td, true);
1153
1154         /* If there are other groups */
1155         if (td->nr_undestroyed_grps > 0)
1156                 wait = true;
1157
1158         spin_unlock_irq(q->queue_lock);
1159
1160         /*
1161          * Wait for tg_to_blkg(tg)->q accessors to exit their grace periods.
1162          * Do this wait only if there are other undestroyed groups out
1163          * there (other than root group). This can happen if cgroup deletion
1164          * path claimed the responsibility of cleaning up a group before
1165          * queue cleanup code get to the group.
1166          *
1167          * Do not call synchronize_rcu() unconditionally as there are drivers
1168          * which create/delete request queue hundreds of times during scan/boot
1169          * and synchronize_rcu() can take significant time and slow down boot.
1170          */
1171         if (wait)
1172                 synchronize_rcu();
1173
1174         /*
1175          * Just being safe to make sure after previous flush if some body did
1176          * update limits through cgroup and another work got queued, cancel
1177          * it.
1178          */
1179         throtl_shutdown_wq(q);
1180
1181         kfree(q->td);
1182 }
1183
1184 static int __init throtl_init(void)
1185 {
1186         kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
1187         if (!kthrotld_workqueue)
1188                 panic("Failed to create kthrotld\n");
1189
1190         blkio_policy_register(&blkio_policy_throtl);
1191         return 0;
1192 }
1193
1194 module_init(throtl_init);