]> git.karo-electronics.de Git - karo-tx-linux.git/blob - block/cfq-iosched.c
blkcg: kill the mind-bending blkg->dev
[karo-tx-linux.git] / block / cfq-iosched.c
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8  */
9 #include <linux/module.h>
10 #include <linux/slab.h>
11 #include <linux/blkdev.h>
12 #include <linux/elevator.h>
13 #include <linux/jiffies.h>
14 #include <linux/rbtree.h>
15 #include <linux/ioprio.h>
16 #include <linux/blktrace_api.h>
17 #include "blk.h"
18 #include "cfq.h"
19
20 /*
21  * tunables
22  */
23 /* max queue in one round of service */
24 static const int cfq_quantum = 8;
25 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
26 /* maximum backwards seek, in KiB */
27 static const int cfq_back_max = 16 * 1024;
28 /* penalty of a backwards seek */
29 static const int cfq_back_penalty = 2;
30 static const int cfq_slice_sync = HZ / 10;
31 static int cfq_slice_async = HZ / 25;
32 static const int cfq_slice_async_rq = 2;
33 static int cfq_slice_idle = HZ / 125;
34 static int cfq_group_idle = HZ / 125;
35 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
36 static const int cfq_hist_divisor = 4;
37
/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT	12

/*
 * Macros that expand to a cast are wrapped in an outer pair of parentheses
 * so that a postfix operator written after the macro applies to the casted
 * value and not to the operand of the cast.
 */
#define CFQQ_SEEK_THR		((sector_t)(8 * 100))
#define CFQQ_CLOSE_THR		((sector_t)(8 * 1024))
#define CFQQ_SECT_THR_NONROT	((sector_t)(2 * 32))
/* true when more than 32/8 bits of the queue's seek_history are set */
#define CFQQ_SEEKY(cfqq)	(hweight32((cfqq)->seek_history) > 32/8)

/* accessors for the scheduler data attached to a request */
#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		((struct cfq_queue *) ((rq)->elv.priv[0]))
#define RQ_CFQG(rq)		((struct cfq_group *) ((rq)->elv.priv[1]))

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

/* a think-time average is only trusted once samples exceeds 80 */
#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
69
/*
 * Think-time bookkeeping; evaluated by cfq_io_thinktime_big().
 */
struct cfq_ttime {
	/* jiffies stamp; CFQ_RB_ROOT seeds it with the current jiffies */
	unsigned long last_end_request;

	unsigned long ttime_total;
	/* sample counter, validated through sample_valid() */
	unsigned long ttime_samples;
	/* mean think time, compared against the idle window */
	unsigned long ttime_mean;
};
77
78 /*
79  * Most of our rbtree usage is for sorting with min extraction, so
80  * if we cache the leftmost node we don't have to walk down the tree
81  * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
82  * move this into the elevator for the rq sorting as well.
83  */
84 struct cfq_rb_root {
85         struct rb_root rb;
86         struct rb_node *left;
87         unsigned count;
88         unsigned total_weight;
89         u64 min_vdisktime;
90         struct cfq_ttime ttime;
91 };
92 #define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT, \
93                         .ttime = {.last_end_request = jiffies,},}
94
95 /*
96  * Per process-grouping structure
97  */
98 struct cfq_queue {
99         /* reference count */
100         int ref;
101         /* various state flags, see below */
102         unsigned int flags;
103         /* parent cfq_data */
104         struct cfq_data *cfqd;
105         /* service_tree member */
106         struct rb_node rb_node;
107         /* service_tree key */
108         unsigned long rb_key;
109         /* prio tree member */
110         struct rb_node p_node;
111         /* prio tree root we belong to, if any */
112         struct rb_root *p_root;
113         /* sorted list of pending requests */
114         struct rb_root sort_list;
115         /* if fifo isn't expired, next request to serve */
116         struct request *next_rq;
117         /* requests queued in sort_list */
118         int queued[2];
119         /* currently allocated requests */
120         int allocated[2];
121         /* fifo list of requests in sort_list */
122         struct list_head fifo;
123
124         /* time when queue got scheduled in to dispatch first request. */
125         unsigned long dispatch_start;
126         unsigned int allocated_slice;
127         unsigned int slice_dispatch;
128         /* time when first request from queue completed and slice started. */
129         unsigned long slice_start;
130         unsigned long slice_end;
131         long slice_resid;
132
133         /* pending priority requests */
134         int prio_pending;
135         /* number of requests that are on the dispatch list or inside driver */
136         int dispatched;
137
138         /* io prio of this group */
139         unsigned short ioprio, org_ioprio;
140         unsigned short ioprio_class;
141
142         pid_t pid;
143
144         u32 seek_history;
145         sector_t last_request_pos;
146
147         struct cfq_rb_root *service_tree;
148         struct cfq_queue *new_cfqq;
149         struct cfq_group *cfqg;
150         /* Number of sectors dispatched from queue in single dispatch round */
151         unsigned long nr_sectors;
152 };
153
/*
 * First index in the service_trees.
 * BE and RT index cfq_group::service_trees[][] directly.  IDLE is handled
 * separately: its queues live on the single service_tree_idle, so
 * IDLE_WORKLOAD is never used as an index into service_trees[][].
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD,
	IDLE_WORKLOAD,
	CFQ_PRIO_NR,		/* number of priority classes */
};
164
/*
 * Second index in the service_trees: the workload type of a queue,
 * as computed by cfqq_type().
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD,
	SYNC_WORKLOAD
};
173
174 /* This is per cgroup per device grouping structure */
175 struct cfq_group {
176         /* group service_tree member */
177         struct rb_node rb_node;
178
179         /* group service_tree key */
180         u64 vdisktime;
181         unsigned int weight;
182         unsigned int new_weight;
183         bool needs_update;
184
185         /* number of cfqq currently on this group */
186         int nr_cfqq;
187
188         /*
189          * Per group busy queues average. Useful for workload slice calc. We
190          * create the array for each prio class but at run time it is used
191          * only for RT and BE class and slot for IDLE class remains unused.
192          * This is primarily done to avoid confusion and a gcc warning.
193          */
194         unsigned int busy_queues_avg[CFQ_PRIO_NR];
195         /*
196          * rr lists of queues with requests. We maintain service trees for
197          * RT and BE classes. These trees are subdivided in subclasses
198          * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
199          * class there is no subclassification and all the cfq queues go on
200          * a single tree service_tree_idle.
201          * Counts are embedded in the cfq_rb_root
202          */
203         struct cfq_rb_root service_trees[2][3];
204         struct cfq_rb_root service_tree_idle;
205
206         unsigned long saved_workload_slice;
207         enum wl_type_t saved_workload;
208         enum wl_prio_t saved_serving_prio;
209         struct blkio_group blkg;
210 #ifdef CONFIG_CFQ_GROUP_IOSCHED
211         struct hlist_node cfqd_node;
212         int ref;
213 #endif
214         /* number of requests that are on the dispatch list or inside driver */
215         int dispatched;
216         struct cfq_ttime ttime;
217 };
218
219 struct cfq_io_cq {
220         struct io_cq            icq;            /* must be the first member */
221         struct cfq_queue        *cfqq[2];
222         struct cfq_ttime        ttime;
223 };
224
225 /*
226  * Per block device queue structure
227  */
228 struct cfq_data {
229         struct request_queue *queue;
230         /* Root service tree for cfq_groups */
231         struct cfq_rb_root grp_service_tree;
232         struct cfq_group *root_group;
233
234         /*
235          * The priority currently being served
236          */
237         enum wl_prio_t serving_prio;
238         enum wl_type_t serving_type;
239         unsigned long workload_expires;
240         struct cfq_group *serving_group;
241
242         /*
243          * Each priority tree is sorted by next_request position.  These
244          * trees are used when determining if two or more queues are
245          * interleaving requests (see cfq_close_cooperator).
246          */
247         struct rb_root prio_trees[CFQ_PRIO_LISTS];
248
249         unsigned int busy_queues;
250         unsigned int busy_sync_queues;
251
252         int rq_in_driver;
253         int rq_in_flight[2];
254
255         /*
256          * queue-depth detection
257          */
258         int rq_queued;
259         int hw_tag;
260         /*
261          * hw_tag can be
262          * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
263          *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
264          *  0 => no NCQ
265          */
266         int hw_tag_est_depth;
267         unsigned int hw_tag_samples;
268
269         /*
270          * idle window management
271          */
272         struct timer_list idle_slice_timer;
273         struct work_struct unplug_work;
274
275         struct cfq_queue *active_queue;
276         struct cfq_io_cq *active_cic;
277
278         /*
279          * async queue for each priority case
280          */
281         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
282         struct cfq_queue *async_idle_cfqq;
283
284         sector_t last_position;
285
286         /*
287          * tunables, see top of file
288          */
289         unsigned int cfq_quantum;
290         unsigned int cfq_fifo_expire[2];
291         unsigned int cfq_back_penalty;
292         unsigned int cfq_back_max;
293         unsigned int cfq_slice[2];
294         unsigned int cfq_slice_async_rq;
295         unsigned int cfq_slice_idle;
296         unsigned int cfq_group_idle;
297         unsigned int cfq_latency;
298
299         /*
300          * Fallback dummy cfqq for extreme OOM conditions
301          */
302         struct cfq_queue oom_cfqq;
303
304         unsigned long last_delayed_sync;
305
306         /* List of cfq groups being managed on this device*/
307         struct hlist_head cfqg_list;
308
309         /* Number of groups which are on blkcg->blkg_list */
310         unsigned int nr_blkcg_linked_grps;
311 };
312
313 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
314
315 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
316                                             enum wl_prio_t prio,
317                                             enum wl_type_t type)
318 {
319         if (!cfqg)
320                 return NULL;
321
322         if (prio == IDLE_WORKLOAD)
323                 return &cfqg->service_tree_idle;
324
325         return &cfqg->service_trees[prio][type];
326 }
327
/*
 * Bit positions within cfq_queue::flags.  The accessor functions for each
 * bit are generated by CFQ_CFQQ_FNS().
 */
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};
343
344 #define CFQ_CFQQ_FNS(name)                                              \
345 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
346 {                                                                       \
347         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
348 }                                                                       \
349 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
350 {                                                                       \
351         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
352 }                                                                       \
353 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
354 {                                                                       \
355         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
356 }
357
358 CFQ_CFQQ_FNS(on_rr);
359 CFQ_CFQQ_FNS(wait_request);
360 CFQ_CFQQ_FNS(must_dispatch);
361 CFQ_CFQQ_FNS(must_alloc_slice);
362 CFQ_CFQQ_FNS(fifo_expire);
363 CFQ_CFQQ_FNS(idle_window);
364 CFQ_CFQQ_FNS(prio_changed);
365 CFQ_CFQQ_FNS(slice_new);
366 CFQ_CFQQ_FNS(sync);
367 CFQ_CFQQ_FNS(coop);
368 CFQ_CFQQ_FNS(split_coop);
369 CFQ_CFQQ_FNS(deep);
370 CFQ_CFQQ_FNS(wait_busy);
371 #undef CFQ_CFQQ_FNS
372
/*
 * blktrace message helpers.  With group scheduling enabled the per-queue
 * and per-group messages also carry the blkio group path; without it the
 * per-group variant compiles to nothing.
 */
#ifdef CONFIG_CFQ_GROUP_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, ...)				\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt,		\
			(cfqq)->pid,					\
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
			blkg_path(&(cfqq)->cfqg->blkg), ##__VA_ARGS__)

#define cfq_log_cfqg(cfqd, cfqg, fmt, ...)				\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
			blkg_path(&(cfqg)->blkg), ##__VA_ARGS__)

#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, ...)				\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid,	\
			##__VA_ARGS__)
#define cfq_log_cfqg(cfqd, cfqg, fmt, ...)		do {} while (0)
#endif
#define cfq_log(cfqd, fmt, ...)						\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##__VA_ARGS__)
390
/*
 * Traverses through cfq group service trees.
 *
 * @cfqg: group whose trees are visited
 * @i:    lvalue receiving the priority class index (0 .. IDLE_WORKLOAD)
 * @j:    lvalue receiving the workload type index
 * @st:   lvalue receiving a pointer to the current tree
 *
 * For the classes below IDLE_WORKLOAD every service_trees[i][j] with
 * j <= SYNC_WORKLOAD is visited; for IDLE_WORKLOAD only service_tree_idle
 * is visited, once.  All arguments are parenthesized so that arbitrary
 * expressions may be passed for @cfqg.
 */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for ((i) = 0; (i) <= IDLE_WORKLOAD; (i)++) \
		for ((j) = 0, (st) = (i) < IDLE_WORKLOAD ? \
			&(cfqg)->service_trees[(i)][(j)] : \
			&(cfqg)->service_tree_idle; \
			((i) < IDLE_WORKLOAD && (j) <= SYNC_WORKLOAD) || \
			((i) == IDLE_WORKLOAD && (j) == 0); \
			(j)++, (st) = (i) < IDLE_WORKLOAD ? \
			&(cfqg)->service_trees[(i)][(j)] : NULL)
400
401 static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
402         struct cfq_ttime *ttime, bool group_idle)
403 {
404         unsigned long slice;
405         if (!sample_valid(ttime->ttime_samples))
406                 return false;
407         if (group_idle)
408                 slice = cfqd->cfq_group_idle;
409         else
410                 slice = cfqd->cfq_slice_idle;
411         return ttime->ttime_mean > slice;
412 }
413
414 static inline bool iops_mode(struct cfq_data *cfqd)
415 {
416         /*
417          * If we are not idling on queues and it is a NCQ drive, parallel
418          * execution of requests is on and measuring time is not possible
419          * in most of the cases until and unless we drive shallower queue
420          * depths and that becomes a performance bottleneck. In such cases
421          * switch to start providing fairness in terms of number of IOs.
422          */
423         if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
424                 return true;
425         else
426                 return false;
427 }
428
429 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
430 {
431         if (cfq_class_idle(cfqq))
432                 return IDLE_WORKLOAD;
433         if (cfq_class_rt(cfqq))
434                 return RT_WORKLOAD;
435         return BE_WORKLOAD;
436 }
437
438
439 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
440 {
441         if (!cfq_cfqq_sync(cfqq))
442                 return ASYNC_WORKLOAD;
443         if (!cfq_cfqq_idle_window(cfqq))
444                 return SYNC_NOIDLE_WORKLOAD;
445         return SYNC_WORKLOAD;
446 }
447
448 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
449                                         struct cfq_data *cfqd,
450                                         struct cfq_group *cfqg)
451 {
452         if (wl == IDLE_WORKLOAD)
453                 return cfqg->service_tree_idle.count;
454
455         return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
456                 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
457                 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
458 }
459
460 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
461                                         struct cfq_group *cfqg)
462 {
463         return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
464                 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
465 }
466
467 static void cfq_dispatch_insert(struct request_queue *, struct request *);
468 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
469                                        struct io_context *, gfp_t);
470
471 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
472 {
473         /* cic->icq is the first member, %NULL will convert to %NULL */
474         return container_of(icq, struct cfq_io_cq, icq);
475 }
476
477 static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
478                                                struct io_context *ioc)
479 {
480         if (ioc)
481                 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
482         return NULL;
483 }
484
485 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
486 {
487         return cic->cfqq[is_sync];
488 }
489
490 static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
491                                 bool is_sync)
492 {
493         cic->cfqq[is_sync] = cfqq;
494 }
495
496 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
497 {
498         return cic->icq.q->elevator->elevator_data;
499 }
500
501 /*
502  * We regard a request as SYNC, if it's either a read or has the SYNC bit
503  * set (in which case it could also be direct WRITE).
504  */
505 static inline bool cfq_bio_sync(struct bio *bio)
506 {
507         return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
508 }
509
510 /*
511  * scheduler run of queue, if there are requests pending and no one in the
512  * driver that will restart queueing
513  */
514 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
515 {
516         if (cfqd->busy_queues) {
517                 cfq_log(cfqd, "schedule dispatch");
518                 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
519         }
520 }
521
522 /*
523  * Scale schedule slice based on io priority. Use the sync time slice only
524  * if a queue is marked sync and has sync io queued. A sync queue with async
525  * io only, should not get full sync slice length.
526  */
527 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
528                                  unsigned short prio)
529 {
530         const int base_slice = cfqd->cfq_slice[sync];
531
532         WARN_ON(prio >= IOPRIO_BE_NR);
533
534         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
535 }
536
537 static inline int
538 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
539 {
540         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
541 }
542
543 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
544 {
545         u64 d = delta << CFQ_SERVICE_SHIFT;
546
547         d = d * BLKIO_WEIGHT_DEFAULT;
548         do_div(d, cfqg->weight);
549         return d;
550 }
551
552 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
553 {
554         s64 delta = (s64)(vdisktime - min_vdisktime);
555         if (delta > 0)
556                 min_vdisktime = vdisktime;
557
558         return min_vdisktime;
559 }
560
561 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
562 {
563         s64 delta = (s64)(vdisktime - min_vdisktime);
564         if (delta < 0)
565                 min_vdisktime = vdisktime;
566
567         return min_vdisktime;
568 }
569
570 static void update_min_vdisktime(struct cfq_rb_root *st)
571 {
572         struct cfq_group *cfqg;
573
574         if (st->left) {
575                 cfqg = rb_entry_cfqg(st->left);
576                 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
577                                                   cfqg->vdisktime);
578         }
579 }
580
581 /*
582  * get averaged number of queues of RT/BE priority.
583  * average is updated, with a formula that gives more weight to higher numbers,
584  * to quickly follows sudden increases and decrease slowly
585  */
586
587 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
588                                         struct cfq_group *cfqg, bool rt)
589 {
590         unsigned min_q, max_q;
591         unsigned mult  = cfq_hist_divisor - 1;
592         unsigned round = cfq_hist_divisor / 2;
593         unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
594
595         min_q = min(cfqg->busy_queues_avg[rt], busy);
596         max_q = max(cfqg->busy_queues_avg[rt], busy);
597         cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
598                 cfq_hist_divisor;
599         return cfqg->busy_queues_avg[rt];
600 }
601
602 static inline unsigned
603 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
604 {
605         struct cfq_rb_root *st = &cfqd->grp_service_tree;
606
607         return cfq_target_latency * cfqg->weight / st->total_weight;
608 }
609
610 static inline unsigned
611 cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
612 {
613         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
614         if (cfqd->cfq_latency) {
615                 /*
616                  * interested queues (we consider only the ones with the same
617                  * priority class in the cfq group)
618                  */
619                 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
620                                                 cfq_class_rt(cfqq));
621                 unsigned sync_slice = cfqd->cfq_slice[1];
622                 unsigned expect_latency = sync_slice * iq;
623                 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
624
625                 if (expect_latency > group_slice) {
626                         unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
627                         /* scale low_slice according to IO priority
628                          * and sync vs async */
629                         unsigned low_slice =
630                                 min(slice, base_low_slice * slice / sync_slice);
631                         /* the adapted slice value is scaled to fit all iqs
632                          * into the target latency */
633                         slice = max(slice * group_slice / expect_latency,
634                                     low_slice);
635                 }
636         }
637         return slice;
638 }
639
640 static inline void
641 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
642 {
643         unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
644
645         cfqq->slice_start = jiffies;
646         cfqq->slice_end = jiffies + slice;
647         cfqq->allocated_slice = slice;
648         cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
649 }
650
651 /*
652  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
653  * isn't valid until the first request from the dispatch is activated
654  * and the slice time set.
655  */
656 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
657 {
658         if (cfq_cfqq_slice_new(cfqq))
659                 return false;
660         if (time_before(jiffies, cfqq->slice_end))
661                 return false;
662
663         return true;
664 }
665
666 /*
667  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
668  * We choose the request that is closest to the head right now. Distance
669  * behind the head is penalized and only allowed to a certain extent.
670  */
671 static struct request *
672 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
673 {
674         sector_t s1, s2, d1 = 0, d2 = 0;
675         unsigned long back_max;
676 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
677 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
678         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
679
680         if (rq1 == NULL || rq1 == rq2)
681                 return rq2;
682         if (rq2 == NULL)
683                 return rq1;
684
685         if (rq_is_sync(rq1) != rq_is_sync(rq2))
686                 return rq_is_sync(rq1) ? rq1 : rq2;
687
688         if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
689                 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
690
691         s1 = blk_rq_pos(rq1);
692         s2 = blk_rq_pos(rq2);
693
694         /*
695          * by definition, 1KiB is 2 sectors
696          */
697         back_max = cfqd->cfq_back_max * 2;
698
699         /*
700          * Strict one way elevator _except_ in the case where we allow
701          * short backward seeks which are biased as twice the cost of a
702          * similar forward seek.
703          */
704         if (s1 >= last)
705                 d1 = s1 - last;
706         else if (s1 + back_max >= last)
707                 d1 = (last - s1) * cfqd->cfq_back_penalty;
708         else
709                 wrap |= CFQ_RQ1_WRAP;
710
711         if (s2 >= last)
712                 d2 = s2 - last;
713         else if (s2 + back_max >= last)
714                 d2 = (last - s2) * cfqd->cfq_back_penalty;
715         else
716                 wrap |= CFQ_RQ2_WRAP;
717
718         /* Found required data */
719
720         /*
721          * By doing switch() on the bit mask "wrap" we avoid having to
722          * check two variables for all permutations: --> faster!
723          */
724         switch (wrap) {
725         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
726                 if (d1 < d2)
727                         return rq1;
728                 else if (d2 < d1)
729                         return rq2;
730                 else {
731                         if (s1 >= s2)
732                                 return rq1;
733                         else
734                                 return rq2;
735                 }
736
737         case CFQ_RQ2_WRAP:
738                 return rq1;
739         case CFQ_RQ1_WRAP:
740                 return rq2;
741         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
742         default:
743                 /*
744                  * Since both rqs are wrapped,
745                  * start with the one that's further behind head
746                  * (--> only *one* back seek required),
747                  * since back seek takes more time than forward.
748                  */
749                 if (s1 <= s2)
750                         return rq1;
751                 else
752                         return rq2;
753         }
754 }
755
756 /*
757  * The below is leftmost cache rbtree addon
758  */
759 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
760 {
761         /* Service tree is empty */
762         if (!root->count)
763                 return NULL;
764
765         if (!root->left)
766                 root->left = rb_first(&root->rb);
767
768         if (root->left)
769                 return rb_entry(root->left, struct cfq_queue, rb_node);
770
771         return NULL;
772 }
773
774 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
775 {
776         if (!root->left)
777                 root->left = rb_first(&root->rb);
778
779         if (root->left)
780                 return rb_entry_cfqg(root->left);
781
782         return NULL;
783 }
784
/*
 * Remove @n from @root and mark it as unlinked, so that a later
 * RB_EMPTY_NODE() test on @n succeeds.
 */
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}
790
791 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
792 {
793         if (root->left == n)
794                 root->left = NULL;
795         rb_erase_init(n, &root->rb);
796         --root->count;
797 }
798
799 /*
800  * would be nice to take fifo expire time into account as well
801  */
802 static struct request *
803 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
804                   struct request *last)
805 {
806         struct rb_node *rbnext = rb_next(&last->rb_node);
807         struct rb_node *rbprev = rb_prev(&last->rb_node);
808         struct request *next = NULL, *prev = NULL;
809
810         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
811
812         if (rbprev)
813                 prev = rb_entry_rq(rbprev);
814
815         if (rbnext)
816                 next = rb_entry_rq(rbnext);
817         else {
818                 rbnext = rb_first(&cfqq->sort_list);
819                 if (rbnext && rbnext != &last->rb_node)
820                         next = rb_entry_rq(rbnext);
821         }
822
823         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
824 }
825
826 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
827                                       struct cfq_queue *cfqq)
828 {
829         /*
830          * just an approximation, should be ok.
831          */
832         return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
833                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
834 }
835
836 static inline s64
837 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
838 {
839         return cfqg->vdisktime - st->min_vdisktime;
840 }
841
842 static void
843 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
844 {
845         struct rb_node **node = &st->rb.rb_node;
846         struct rb_node *parent = NULL;
847         struct cfq_group *__cfqg;
848         s64 key = cfqg_key(st, cfqg);
849         int left = 1;
850
851         while (*node != NULL) {
852                 parent = *node;
853                 __cfqg = rb_entry_cfqg(parent);
854
855                 if (key < cfqg_key(st, __cfqg))
856                         node = &parent->rb_left;
857                 else {
858                         node = &parent->rb_right;
859                         left = 0;
860                 }
861         }
862
863         if (left)
864                 st->left = &cfqg->rb_node;
865
866         rb_link_node(&cfqg->rb_node, parent, node);
867         rb_insert_color(&cfqg->rb_node, &st->rb);
868 }
869
870 static void
871 cfq_update_group_weight(struct cfq_group *cfqg)
872 {
873         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
874         if (cfqg->needs_update) {
875                 cfqg->weight = cfqg->new_weight;
876                 cfqg->needs_update = false;
877         }
878 }
879
880 static void
881 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
882 {
883         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
884
885         cfq_update_group_weight(cfqg);
886         __cfq_group_service_tree_add(st, cfqg);
887         st->total_weight += cfqg->weight;
888 }
889
890 static void
891 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
892 {
893         struct cfq_rb_root *st = &cfqd->grp_service_tree;
894         struct cfq_group *__cfqg;
895         struct rb_node *n;
896
897         cfqg->nr_cfqq++;
898         if (!RB_EMPTY_NODE(&cfqg->rb_node))
899                 return;
900
901         /*
902          * Currently put the group at the end. Later implement something
903          * so that groups get lesser vtime based on their weights, so that
904          * if group does not loose all if it was not continuously backlogged.
905          */
906         n = rb_last(&st->rb);
907         if (n) {
908                 __cfqg = rb_entry_cfqg(n);
909                 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
910         } else
911                 cfqg->vdisktime = st->min_vdisktime;
912         cfq_group_service_tree_add(st, cfqg);
913 }
914
915 static void
916 cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
917 {
918         st->total_weight -= cfqg->weight;
919         if (!RB_EMPTY_NODE(&cfqg->rb_node))
920                 cfq_rb_erase(&cfqg->rb_node, st);
921 }
922
923 static void
924 cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
925 {
926         struct cfq_rb_root *st = &cfqd->grp_service_tree;
927
928         BUG_ON(cfqg->nr_cfqq < 1);
929         cfqg->nr_cfqq--;
930
931         /* If there are other cfq queues under this group, don't delete it */
932         if (cfqg->nr_cfqq)
933                 return;
934
935         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
936         cfq_group_service_tree_del(st, cfqg);
937         cfqg->saved_workload_slice = 0;
938         cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
939 }
940
941 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
942                                                 unsigned int *unaccounted_time)
943 {
944         unsigned int slice_used;
945
946         /*
947          * Queue got expired before even a single request completed or
948          * got expired immediately after first request completion.
949          */
950         if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
951                 /*
952                  * Also charge the seek time incurred to the group, otherwise
953                  * if there are mutiple queues in the group, each can dispatch
954                  * a single request on seeky media and cause lots of seek time
955                  * and group will never know it.
956                  */
957                 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
958                                         1);
959         } else {
960                 slice_used = jiffies - cfqq->slice_start;
961                 if (slice_used > cfqq->allocated_slice) {
962                         *unaccounted_time = slice_used - cfqq->allocated_slice;
963                         slice_used = cfqq->allocated_slice;
964                 }
965                 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
966                         *unaccounted_time += cfqq->slice_start -
967                                         cfqq->dispatch_start;
968         }
969
970         return slice_used;
971 }
972
973 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
974                                 struct cfq_queue *cfqq)
975 {
976         struct cfq_rb_root *st = &cfqd->grp_service_tree;
977         unsigned int used_sl, charge, unaccounted_sl = 0;
978         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
979                         - cfqg->service_tree_idle.count;
980
981         BUG_ON(nr_sync < 0);
982         used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
983
984         if (iops_mode(cfqd))
985                 charge = cfqq->slice_dispatch;
986         else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
987                 charge = cfqq->allocated_slice;
988
989         /* Can't update vdisktime while group is on service tree */
990         cfq_group_service_tree_del(st, cfqg);
991         cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
992         /* If a new weight was requested, update now, off tree */
993         cfq_group_service_tree_add(st, cfqg);
994
995         /* This group is being expired. Save the context */
996         if (time_after(cfqd->workload_expires, jiffies)) {
997                 cfqg->saved_workload_slice = cfqd->workload_expires
998                                                 - jiffies;
999                 cfqg->saved_workload = cfqd->serving_type;
1000                 cfqg->saved_serving_prio = cfqd->serving_prio;
1001         } else
1002                 cfqg->saved_workload_slice = 0;
1003
1004         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1005                                         st->min_vdisktime);
1006         cfq_log_cfqq(cfqq->cfqd, cfqq,
1007                      "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1008                      used_sl, cfqq->slice_dispatch, charge,
1009                      iops_mode(cfqd), cfqq->nr_sectors);
1010         cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
1011                                           unaccounted_sl);
1012         cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
1013 }
1014
1015 /**
1016  * cfq_init_cfqg_base - initialize base part of a cfq_group
1017  * @cfqg: cfq_group to initialize
1018  *
1019  * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1020  * is enabled or not.
1021  */
1022 static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1023 {
1024         struct cfq_rb_root *st;
1025         int i, j;
1026
1027         for_each_cfqg_st(cfqg, i, j, st)
1028                 *st = CFQ_RB_ROOT;
1029         RB_CLEAR_NODE(&cfqg->rb_node);
1030
1031         cfqg->ttime.last_end_request = jiffies;
1032 }
1033
1034 #ifdef CONFIG_CFQ_GROUP_IOSCHED
1035 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
1036 {
1037         if (blkg)
1038                 return container_of(blkg, struct cfq_group, blkg);
1039         return NULL;
1040 }
1041
1042 static void cfq_update_blkio_group_weight(struct request_queue *q,
1043                                           struct blkio_group *blkg,
1044                                           unsigned int weight)
1045 {
1046         struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1047         cfqg->new_weight = weight;
1048         cfqg->needs_update = true;
1049 }
1050
1051 static void cfq_link_blkio_group(struct request_queue *q,
1052                                  struct blkio_group *blkg)
1053 {
1054         struct cfq_data *cfqd = q->elevator->elevator_data;
1055         struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1056
1057         cfqd->nr_blkcg_linked_grps++;
1058
1059         /* Add group on cfqd list */
1060         hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1061 }
1062
1063 static struct blkio_group *cfq_alloc_blkio_group(struct request_queue *q,
1064                                                  struct blkio_cgroup *blkcg)
1065 {
1066         struct cfq_group *cfqg;
1067
1068         cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, q->node);
1069         if (!cfqg)
1070                 return NULL;
1071
1072         cfq_init_cfqg_base(cfqg);
1073         cfqg->weight = blkcg->weight;
1074
1075         /*
1076          * Take the initial reference that will be released on destroy
1077          * This can be thought of a joint reference by cgroup and
1078          * elevator which will be dropped by either elevator exit
1079          * or cgroup deletion path depending on who is exiting first.
1080          */
1081         cfqg->ref = 1;
1082
1083         return &cfqg->blkg;
1084 }
1085
1086 /*
1087  * Search for the cfq group current task belongs to. request_queue lock must
1088  * be held.
1089  */
1090 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1091                                                 struct blkio_cgroup *blkcg)
1092 {
1093         struct request_queue *q = cfqd->queue;
1094         struct cfq_group *cfqg = NULL;
1095
1096         /* avoid lookup for the common case where there's no blkio cgroup */
1097         if (blkcg == &blkio_root_cgroup) {
1098                 cfqg = cfqd->root_group;
1099         } else {
1100                 struct blkio_group *blkg;
1101
1102                 blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_PROP, false);
1103                 if (!IS_ERR(blkg))
1104                         cfqg = cfqg_of_blkg(blkg);
1105         }
1106
1107         return cfqg;
1108 }
1109
1110 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1111 {
1112         cfqg->ref++;
1113         return cfqg;
1114 }
1115
1116 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1117 {
1118         /* Currently, all async queues are mapped to root group */
1119         if (!cfq_cfqq_sync(cfqq))
1120                 cfqg = cfqq->cfqd->root_group;
1121
1122         cfqq->cfqg = cfqg;
1123         /* cfqq reference on cfqg */
1124         cfqq->cfqg->ref++;
1125 }
1126
1127 static void cfq_put_cfqg(struct cfq_group *cfqg)
1128 {
1129         struct cfq_rb_root *st;
1130         int i, j;
1131
1132         BUG_ON(cfqg->ref <= 0);
1133         cfqg->ref--;
1134         if (cfqg->ref)
1135                 return;
1136         for_each_cfqg_st(cfqg, i, j, st)
1137                 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1138         free_percpu(cfqg->blkg.stats_cpu);
1139         kfree(cfqg);
1140 }
1141
1142 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1143 {
1144         /* Something wrong if we are trying to remove same group twice */
1145         BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1146
1147         hlist_del_init(&cfqg->cfqd_node);
1148
1149         BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
1150         cfqd->nr_blkcg_linked_grps--;
1151
1152         /*
1153          * Put the reference taken at the time of creation so that when all
1154          * queues are gone, group can be destroyed.
1155          */
1156         cfq_put_cfqg(cfqg);
1157 }
1158
1159 static bool cfq_release_cfq_groups(struct cfq_data *cfqd)
1160 {
1161         struct hlist_node *pos, *n;
1162         struct cfq_group *cfqg;
1163         bool empty = true;
1164
1165         hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1166                 /*
1167                  * If cgroup removal path got to blk_group first and removed
1168                  * it from cgroup list, then it will take care of destroying
1169                  * cfqg also.
1170                  */
1171                 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1172                         cfq_destroy_cfqg(cfqd, cfqg);
1173                 else
1174                         empty = false;
1175         }
1176         return empty;
1177 }
1178
1179 /*
1180  * Blk cgroup controller notification saying that blkio_group object is being
1181  * delinked as associated cgroup object is going away. That also means that
1182  * no new IO will come in this group. So get rid of this group as soon as
1183  * any pending IO in the group is finished.
1184  *
1185  * This function is called under rcu_read_lock(). key is the rcu protected
1186  * pointer. That means @q is a valid request_queue pointer as long as we
1187  * are rcu read lock.
1188  *
1189  * @q was fetched from blkio_group under blkio_cgroup->lock. That means
1190  * it should not be NULL as even if elevator was exiting, cgroup deltion
1191  * path got to it first.
1192  */
1193 static void cfq_unlink_blkio_group(struct request_queue *q,
1194                                    struct blkio_group *blkg)
1195 {
1196         struct cfq_data *cfqd = q->elevator->elevator_data;
1197         unsigned long flags;
1198
1199         spin_lock_irqsave(q->queue_lock, flags);
1200         cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1201         spin_unlock_irqrestore(q->queue_lock, flags);
1202 }
1203
1204 static struct elevator_type iosched_cfq;
1205
1206 static bool cfq_clear_queue(struct request_queue *q)
1207 {
1208         lockdep_assert_held(q->queue_lock);
1209
1210         /* shoot down blkgs iff the current elevator is cfq */
1211         if (!q->elevator || q->elevator->type != &iosched_cfq)
1212                 return true;
1213
1214         return cfq_release_cfq_groups(q->elevator->elevator_data);
1215 }
1216
1217 #else /* GROUP_IOSCHED */
1218 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1219                                                 struct blkio_cgroup *blkcg)
1220 {
1221         return cfqd->root_group;
1222 }
1223
/* No reference counting without group scheduling: return @cfqg as is. */
static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
        return cfqg;
}
1228
1229 static inline void
1230 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1231         cfqq->cfqg = cfqg;
1232 }
1233
/* Nothing to release when group scheduling is compiled out. */
static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
}
/* Groups are not reference counted when group scheduling is compiled out. */
static inline void cfq_put_cfqg(struct cfq_group *cfqg)
{
}
1236
1237 #endif /* GROUP_IOSCHED */
1238
1239 /*
1240  * The cfqd->service_trees holds all pending cfq_queue's that have
1241  * requests waiting to be processed. It is sorted in the order that
1242  * we will service the queues.
1243  */
1244 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1245                                  bool add_front)
1246 {
1247         struct rb_node **p, *parent;
1248         struct cfq_queue *__cfqq;
1249         unsigned long rb_key;
1250         struct cfq_rb_root *service_tree;
1251         int left;
1252         int new_cfqq = 1;
1253
1254         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1255                                                 cfqq_type(cfqq));
1256         if (cfq_class_idle(cfqq)) {
1257                 rb_key = CFQ_IDLE_DELAY;
1258                 parent = rb_last(&service_tree->rb);
1259                 if (parent && parent != &cfqq->rb_node) {
1260                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1261                         rb_key += __cfqq->rb_key;
1262                 } else
1263                         rb_key += jiffies;
1264         } else if (!add_front) {
1265                 /*
1266                  * Get our rb key offset. Subtract any residual slice
1267                  * value carried from last service. A negative resid
1268                  * count indicates slice overrun, and this should position
1269                  * the next service time further away in the tree.
1270                  */
1271                 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1272                 rb_key -= cfqq->slice_resid;
1273                 cfqq->slice_resid = 0;
1274         } else {
1275                 rb_key = -HZ;
1276                 __cfqq = cfq_rb_first(service_tree);
1277                 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1278         }
1279
1280         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1281                 new_cfqq = 0;
1282                 /*
1283                  * same position, nothing more to do
1284                  */
1285                 if (rb_key == cfqq->rb_key &&
1286                     cfqq->service_tree == service_tree)
1287                         return;
1288
1289                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1290                 cfqq->service_tree = NULL;
1291         }
1292
1293         left = 1;
1294         parent = NULL;
1295         cfqq->service_tree = service_tree;
1296         p = &service_tree->rb.rb_node;
1297         while (*p) {
1298                 struct rb_node **n;
1299
1300                 parent = *p;
1301                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1302
1303                 /*
1304                  * sort by key, that represents service time.
1305                  */
1306                 if (time_before(rb_key, __cfqq->rb_key))
1307                         n = &(*p)->rb_left;
1308                 else {
1309                         n = &(*p)->rb_right;
1310                         left = 0;
1311                 }
1312
1313                 p = n;
1314         }
1315
1316         if (left)
1317                 service_tree->left = &cfqq->rb_node;
1318
1319         cfqq->rb_key = rb_key;
1320         rb_link_node(&cfqq->rb_node, parent, p);
1321         rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1322         service_tree->count++;
1323         if (add_front || !new_cfqq)
1324                 return;
1325         cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1326 }
1327
1328 static struct cfq_queue *
1329 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1330                      sector_t sector, struct rb_node **ret_parent,
1331                      struct rb_node ***rb_link)
1332 {
1333         struct rb_node **p, *parent;
1334         struct cfq_queue *cfqq = NULL;
1335
1336         parent = NULL;
1337         p = &root->rb_node;
1338         while (*p) {
1339                 struct rb_node **n;
1340
1341                 parent = *p;
1342                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
1343
1344                 /*
1345                  * Sort strictly based on sector.  Smallest to the left,
1346                  * largest to the right.
1347                  */
1348                 if (sector > blk_rq_pos(cfqq->next_rq))
1349                         n = &(*p)->rb_right;
1350                 else if (sector < blk_rq_pos(cfqq->next_rq))
1351                         n = &(*p)->rb_left;
1352                 else
1353                         break;
1354                 p = n;
1355                 cfqq = NULL;
1356         }
1357
1358         *ret_parent = parent;
1359         if (rb_link)
1360                 *rb_link = p;
1361         return cfqq;
1362 }
1363
1364 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1365 {
1366         struct rb_node **p, *parent;
1367         struct cfq_queue *__cfqq;
1368
1369         if (cfqq->p_root) {
1370                 rb_erase(&cfqq->p_node, cfqq->p_root);
1371                 cfqq->p_root = NULL;
1372         }
1373
1374         if (cfq_class_idle(cfqq))
1375                 return;
1376         if (!cfqq->next_rq)
1377                 return;
1378
1379         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1380         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1381                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
1382         if (!__cfqq) {
1383                 rb_link_node(&cfqq->p_node, parent, p);
1384                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
1385         } else
1386                 cfqq->p_root = NULL;
1387 }
1388
1389 /*
1390  * Update cfqq's position in the service tree.
1391  */
1392 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1393 {
1394         /*
1395          * Resorting requires the cfqq to be on the RR list already.
1396          */
1397         if (cfq_cfqq_on_rr(cfqq)) {
1398                 cfq_service_tree_add(cfqd, cfqq, 0);
1399                 cfq_prio_tree_add(cfqd, cfqq);
1400         }
1401 }
1402
1403 /*
1404  * add to busy list of queues for service, trying to be fair in ordering
1405  * the pending list according to last request service
1406  */
1407 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1408 {
1409         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1410         BUG_ON(cfq_cfqq_on_rr(cfqq));
1411         cfq_mark_cfqq_on_rr(cfqq);
1412         cfqd->busy_queues++;
1413         if (cfq_cfqq_sync(cfqq))
1414                 cfqd->busy_sync_queues++;
1415
1416         cfq_resort_rr_list(cfqd, cfqq);
1417 }
1418
1419 /*
1420  * Called when the cfqq no longer has requests pending, remove it from
1421  * the service tree.
1422  */
1423 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1424 {
1425         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1426         BUG_ON(!cfq_cfqq_on_rr(cfqq));
1427         cfq_clear_cfqq_on_rr(cfqq);
1428
1429         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1430                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1431                 cfqq->service_tree = NULL;
1432         }
1433         if (cfqq->p_root) {
1434                 rb_erase(&cfqq->p_node, cfqq->p_root);
1435                 cfqq->p_root = NULL;
1436         }
1437
1438         cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1439         BUG_ON(!cfqd->busy_queues);
1440         cfqd->busy_queues--;
1441         if (cfq_cfqq_sync(cfqq))
1442                 cfqd->busy_sync_queues--;
1443 }
1444
1445 /*
1446  * rb tree support functions
1447  */
1448 static void cfq_del_rq_rb(struct request *rq)
1449 {
1450         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1451         const int sync = rq_is_sync(rq);
1452
1453         BUG_ON(!cfqq->queued[sync]);
1454         cfqq->queued[sync]--;
1455
1456         elv_rb_del(&cfqq->sort_list, rq);
1457
1458         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1459                 /*
1460                  * Queue will be deleted from service tree when we actually
1461                  * expire it later. Right now just remove it from prio tree
1462                  * as it is empty.
1463                  */
1464                 if (cfqq->p_root) {
1465                         rb_erase(&cfqq->p_node, cfqq->p_root);
1466                         cfqq->p_root = NULL;
1467                 }
1468         }
1469 }
1470
1471 static void cfq_add_rq_rb(struct request *rq)
1472 {
1473         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1474         struct cfq_data *cfqd = cfqq->cfqd;
1475         struct request *prev;
1476
1477         cfqq->queued[rq_is_sync(rq)]++;
1478
1479         elv_rb_add(&cfqq->sort_list, rq);
1480
1481         if (!cfq_cfqq_on_rr(cfqq))
1482                 cfq_add_cfqq_rr(cfqd, cfqq);
1483
1484         /*
1485          * check if this request is a better next-serve candidate
1486          */
1487         prev = cfqq->next_rq;
1488         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1489
1490         /*
1491          * adjust priority tree position, if ->next_rq changes
1492          */
1493         if (prev != cfqq->next_rq)
1494                 cfq_prio_tree_add(cfqd, cfqq);
1495
1496         BUG_ON(!cfqq->next_rq);
1497 }
1498
1499 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1500 {
1501         elv_rb_del(&cfqq->sort_list, rq);
1502         cfqq->queued[rq_is_sync(rq)]--;
1503         cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1504                                         rq_data_dir(rq), rq_is_sync(rq));
1505         cfq_add_rq_rb(rq);
1506         cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1507                         &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1508                         rq_is_sync(rq));
1509 }
1510
1511 static struct request *
1512 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1513 {
1514         struct task_struct *tsk = current;
1515         struct cfq_io_cq *cic;
1516         struct cfq_queue *cfqq;
1517
1518         cic = cfq_cic_lookup(cfqd, tsk->io_context);
1519         if (!cic)
1520                 return NULL;
1521
1522         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1523         if (cfqq) {
1524                 sector_t sector = bio->bi_sector + bio_sectors(bio);
1525
1526                 return elv_rb_find(&cfqq->sort_list, sector);
1527         }
1528
1529         return NULL;
1530 }
1531
1532 static void cfq_activate_request(struct request_queue *q, struct request *rq)
1533 {
1534         struct cfq_data *cfqd = q->elevator->elevator_data;
1535
1536         cfqd->rq_in_driver++;
1537         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1538                                                 cfqd->rq_in_driver);
1539
1540         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1541 }
1542
1543 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1544 {
1545         struct cfq_data *cfqd = q->elevator->elevator_data;
1546
1547         WARN_ON(!cfqd->rq_in_driver);
1548         cfqd->rq_in_driver--;
1549         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1550                                                 cfqd->rq_in_driver);
1551 }
1552
1553 static void cfq_remove_request(struct request *rq)
1554 {
1555         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1556
1557         if (cfqq->next_rq == rq)
1558                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1559
1560         list_del_init(&rq->queuelist);
1561         cfq_del_rq_rb(rq);
1562
1563         cfqq->cfqd->rq_queued--;
1564         cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1565                                         rq_data_dir(rq), rq_is_sync(rq));
1566         if (rq->cmd_flags & REQ_PRIO) {
1567                 WARN_ON(!cfqq->prio_pending);
1568                 cfqq->prio_pending--;
1569         }
1570 }
1571
1572 static int cfq_merge(struct request_queue *q, struct request **req,
1573                      struct bio *bio)
1574 {
1575         struct cfq_data *cfqd = q->elevator->elevator_data;
1576         struct request *__rq;
1577
1578         __rq = cfq_find_rq_fmerge(cfqd, bio);
1579         if (__rq && elv_rq_merge_ok(__rq, bio)) {
1580                 *req = __rq;
1581                 return ELEVATOR_FRONT_MERGE;
1582         }
1583
1584         return ELEVATOR_NO_MERGE;
1585 }
1586
1587 static void cfq_merged_request(struct request_queue *q, struct request *req,
1588                                int type)
1589 {
1590         if (type == ELEVATOR_FRONT_MERGE) {
1591                 struct cfq_queue *cfqq = RQ_CFQQ(req);
1592
1593                 cfq_reposition_rq_rb(cfqq, req);
1594         }
1595 }
1596
1597 static void cfq_bio_merged(struct request_queue *q, struct request *req,
1598                                 struct bio *bio)
1599 {
1600         cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1601                                         bio_data_dir(bio), cfq_bio_sync(bio));
1602 }
1603
1604 static void
1605 cfq_merged_requests(struct request_queue *q, struct request *rq,
1606                     struct request *next)
1607 {
1608         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1609         struct cfq_data *cfqd = q->elevator->elevator_data;
1610
1611         /*
1612          * reposition in fifo if next is older than rq
1613          */
1614         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1615             time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1616                 list_move(&rq->queuelist, &next->queuelist);
1617                 rq_set_fifo_time(rq, rq_fifo_time(next));
1618         }
1619
1620         if (cfqq->next_rq == next)
1621                 cfqq->next_rq = rq;
1622         cfq_remove_request(next);
1623         cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1624                                         rq_data_dir(next), rq_is_sync(next));
1625
1626         cfqq = RQ_CFQQ(next);
1627         /*
1628          * all requests of this queue are merged to other queues, delete it
1629          * from the service tree. If it's the active_queue,
1630          * cfq_dispatch_requests() will choose to expire it or do idle
1631          */
1632         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
1633             cfqq != cfqd->active_queue)
1634                 cfq_del_cfqq_rr(cfqd, cfqq);
1635 }
1636
1637 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1638                            struct bio *bio)
1639 {
1640         struct cfq_data *cfqd = q->elevator->elevator_data;
1641         struct cfq_io_cq *cic;
1642         struct cfq_queue *cfqq;
1643
1644         /*
1645          * Disallow merge of a sync bio into an async request.
1646          */
1647         if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1648                 return false;
1649
1650         /*
1651          * Lookup the cfqq that this bio will be queued with and allow
1652          * merge only if rq is queued there.
1653          */
1654         cic = cfq_cic_lookup(cfqd, current->io_context);
1655         if (!cic)
1656                 return false;
1657
1658         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1659         return cfqq == RQ_CFQQ(rq);
1660 }
1661
1662 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1663 {
1664         del_timer(&cfqd->idle_slice_timer);
1665         cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1666 }
1667
1668 static void __cfq_set_active_queue(struct cfq_data *cfqd,
1669                                    struct cfq_queue *cfqq)
1670 {
1671         if (cfqq) {
1672                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1673                                 cfqd->serving_prio, cfqd->serving_type);
1674                 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1675                 cfqq->slice_start = 0;
1676                 cfqq->dispatch_start = jiffies;
1677                 cfqq->allocated_slice = 0;
1678                 cfqq->slice_end = 0;
1679                 cfqq->slice_dispatch = 0;
1680                 cfqq->nr_sectors = 0;
1681
1682                 cfq_clear_cfqq_wait_request(cfqq);
1683                 cfq_clear_cfqq_must_dispatch(cfqq);
1684                 cfq_clear_cfqq_must_alloc_slice(cfqq);
1685                 cfq_clear_cfqq_fifo_expire(cfqq);
1686                 cfq_mark_cfqq_slice_new(cfqq);
1687
1688                 cfq_del_timer(cfqd, cfqq);
1689         }
1690
1691         cfqd->active_queue = cfqq;
1692 }
1693
/*
 * current cfqq expired its slice (or was too idle); retire it as the queue
 * being served.
 *
 * @cfqd:      scheduler data the queue belongs to
 * @cfqq:      queue whose slice ends
 * @timed_out: when true, the unused part of the slice is stored in
 *             cfqq->slice_resid
 *
 * Note: nothing in this function body picks a replacement queue; it only
 * tears down the state of @cfqq and clears cfqd->active_queue and
 * cfqd->active_cic.
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                    bool timed_out)
{
        cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

        /* the timer is only dealt with if the queue was flagged as waiting */
        if (cfq_cfqq_wait_request(cfqq))
                cfq_del_timer(cfqd, cfqq);

        /* both wait flags are cleared unconditionally */
        cfq_clear_cfqq_wait_request(cfqq);
        cfq_clear_cfqq_wait_busy(cfqq);

        /*
         * If this cfqq is shared between multiple processes, check to
         * make sure that those processes are still issuing I/Os within
         * the mean seek distance.  If not, it may be time to break the
         * queues apart again.
         */
        if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
                cfq_mark_cfqq_split_coop(cfqq);

        /*
         * store what was left of this slice, if the queue idled/timed out
         */
        if (timed_out) {
                /*
                 * A queue still flagged slice_new gets the value of
                 * cfq_scaled_cfqq_slice(); otherwise the residual is the
                 * distance from now to slice_end.
                 */
                if (cfq_cfqq_slice_new(cfqq))
                        cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
                else
                        cfqq->slice_resid = cfqq->slice_end - jiffies;
                cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
        }

        cfq_group_served(cfqd, cfqq->cfqg, cfqq);

        /* a queue with an empty sort_list is taken off the rr list */
        if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
                cfq_del_cfqq_rr(cfqd, cfqq);

        cfq_resort_rr_list(cfqd, cfqq);

        /* only clear active_queue if it still points at this queue */
        if (cfqq == cfqd->active_queue)
                cfqd->active_queue = NULL;

        /*
         * Drop the io_context reference held through active_cic (the
         * reference is taken in cfq_dispatch_request() when active_cic is
         * set).
         */
        if (cfqd->active_cic) {
                put_io_context(cfqd->active_cic->icq.ioc);
                cfqd->active_cic = NULL;
        }
}
1744
1745 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1746 {
1747         struct cfq_queue *cfqq = cfqd->active_queue;
1748
1749         if (cfqq)
1750                 __cfq_slice_expired(cfqd, cfqq, timed_out);
1751 }
1752
1753 /*
1754  * Get next queue for service. Unless we have a queue preemption,
1755  * we'll simply select the first cfqq in the service tree.
1756  */
1757 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1758 {
1759         struct cfq_rb_root *service_tree =
1760                 service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1761                                         cfqd->serving_type);
1762
1763         if (!cfqd->rq_queued)
1764                 return NULL;
1765
1766         /* There is nothing to dispatch */
1767         if (!service_tree)
1768                 return NULL;
1769         if (RB_EMPTY_ROOT(&service_tree->rb))
1770                 return NULL;
1771         return cfq_rb_first(service_tree);
1772 }
1773
1774 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1775 {
1776         struct cfq_group *cfqg;
1777         struct cfq_queue *cfqq;
1778         int i, j;
1779         struct cfq_rb_root *st;
1780
1781         if (!cfqd->rq_queued)
1782                 return NULL;
1783
1784         cfqg = cfq_get_next_cfqg(cfqd);
1785         if (!cfqg)
1786                 return NULL;
1787
1788         for_each_cfqg_st(cfqg, i, j, st)
1789                 if ((cfqq = cfq_rb_first(st)) != NULL)
1790                         return cfqq;
1791         return NULL;
1792 }
1793
/*
 * Make a queue the active one and return it.  If the caller passes no
 * queue, the one returned by cfq_get_next_queue() is used (which may be
 * NULL).
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
                                              struct cfq_queue *cfqq)
{
        struct cfq_queue *chosen = cfqq ? cfqq : cfq_get_next_queue(cfqd);

        __cfq_set_active_queue(cfqd, chosen);

        return chosen;
}
1806
1807 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1808                                           struct request *rq)
1809 {
1810         if (blk_rq_pos(rq) >= cfqd->last_position)
1811                 return blk_rq_pos(rq) - cfqd->last_position;
1812         else
1813                 return cfqd->last_position - blk_rq_pos(rq);
1814 }
1815
1816 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1817                                struct request *rq)
1818 {
1819         return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1820 }
1821
1822 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1823                                     struct cfq_queue *cur_cfqq)
1824 {
1825         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1826         struct rb_node *parent, *node;
1827         struct cfq_queue *__cfqq;
1828         sector_t sector = cfqd->last_position;
1829
1830         if (RB_EMPTY_ROOT(root))
1831                 return NULL;
1832
1833         /*
1834          * First, if we find a request starting at the end of the last
1835          * request, choose it.
1836          */
1837         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1838         if (__cfqq)
1839                 return __cfqq;
1840
1841         /*
1842          * If the exact sector wasn't found, the parent of the NULL leaf
1843          * will contain the closest sector.
1844          */
1845         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1846         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1847                 return __cfqq;
1848
1849         if (blk_rq_pos(__cfqq->next_rq) < sector)
1850                 node = rb_next(&__cfqq->p_node);
1851         else
1852                 node = rb_prev(&__cfqq->p_node);
1853         if (!node)
1854                 return NULL;
1855
1856         __cfqq = rb_entry(node, struct cfq_queue, p_node);
1857         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1858                 return __cfqq;
1859
1860         return NULL;
1861 }
1862
1863 /*
1864  * cfqd - obvious
1865  * cur_cfqq - passed in so that we don't decide that the current queue is
1866  *            closely cooperating with itself.
1867  *
1868  * So, basically we're assuming that that cur_cfqq has dispatched at least
1869  * one request, and that cfqd->last_position reflects a position on the disk
1870  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1871  * assumption.
1872  */
1873 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1874                                               struct cfq_queue *cur_cfqq)
1875 {
1876         struct cfq_queue *cfqq;
1877
1878         if (cfq_class_idle(cur_cfqq))
1879                 return NULL;
1880         if (!cfq_cfqq_sync(cur_cfqq))
1881                 return NULL;
1882         if (CFQQ_SEEKY(cur_cfqq))
1883                 return NULL;
1884
1885         /*
1886          * Don't search priority tree if it's the only queue in the group.
1887          */
1888         if (cur_cfqq->cfqg->nr_cfqq == 1)
1889                 return NULL;
1890
1891         /*
1892          * We should notice if some of the queues are cooperating, eg
1893          * working closely on the same area of the disk. In that case,
1894          * we can group them together and don't waste time idling.
1895          */
1896         cfqq = cfqq_close(cfqd, cur_cfqq);
1897         if (!cfqq)
1898                 return NULL;
1899
1900         /* If new queue belongs to different cfq_group, don't choose it */
1901         if (cur_cfqq->cfqg != cfqq->cfqg)
1902                 return NULL;
1903
1904         /*
1905          * It only makes sense to merge sync queues.
1906          */
1907         if (!cfq_cfqq_sync(cfqq))
1908                 return NULL;
1909         if (CFQQ_SEEKY(cfqq))
1910                 return NULL;
1911
1912         /*
1913          * Do not merge queues of different priority classes
1914          */
1915         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1916                 return NULL;
1917
1918         return cfqq;
1919 }
1920
1921 /*
1922  * Determine whether we should enforce idle window for this queue.
1923  */
1924
1925 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1926 {
1927         enum wl_prio_t prio = cfqq_prio(cfqq);
1928         struct cfq_rb_root *service_tree = cfqq->service_tree;
1929
1930         BUG_ON(!service_tree);
1931         BUG_ON(!service_tree->count);
1932
1933         if (!cfqd->cfq_slice_idle)
1934                 return false;
1935
1936         /* We never do for idle class queues. */
1937         if (prio == IDLE_WORKLOAD)
1938                 return false;
1939
1940         /* We do for queues that were marked with idle window flag. */
1941         if (cfq_cfqq_idle_window(cfqq) &&
1942            !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1943                 return true;
1944
1945         /*
1946          * Otherwise, we do only if they are the last ones
1947          * in their service tree.
1948          */
1949         if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
1950            !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
1951                 return true;
1952         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1953                         service_tree->count);
1954         return false;
1955 }
1956
/*
 * Flag the active queue as waiting for a request and arm
 * cfqd->idle_slice_timer.  The timeout is cfqd->cfq_slice_idle for queue
 * idling or cfqd->cfq_group_idle for group idling.  Returns without arming
 * the timer as soon as one of the checks below rules idling out.
 *
 * The function dereferences cfqd->active_queue without a NULL check, so
 * callers are presumably expected to have an active queue - TODO confirm.
 */
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
        struct cfq_queue *cfqq = cfqd->active_queue;
        struct cfq_io_cq *cic;
        /* group_idle stays 0 unless we fall back from queue to group idling */
        unsigned long sl, group_idle = 0;

        /*
         * SSD device without seek penalty, disable idling. But only do so
         * for devices that support queuing, otherwise we still have a problem
         * with sync vs async workloads.
         */
        if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
                return;

        /* warn if the queue still has sorted requests or a fresh slice */
        WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
        WARN_ON(cfq_cfqq_slice_new(cfqq));

        /*
         * idle is disabled, either manually or by past process history
         */
        if (!cfq_should_idle(cfqd, cfqq)) {
                /* no queue idling. Check for group idling */
                if (cfqd->cfq_group_idle)
                        group_idle = cfqd->cfq_group_idle;
                else
                        return;
        }

        /*
         * still active requests from this queue, don't idle
         */
        if (cfqq->dispatched)
                return;

        /*
         * task has exited, don't wait
         */
        cic = cfqd->active_cic;
        if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
                return;

        /*
         * If our average think time is larger than the remaining time
         * slice, then don't idle. This avoids overrunning the allotted
         * time slice.
         *
         * NOTE(review): the remaining time is a plain subtraction; if
         * slice_end and jiffies are unsigned and slice_end has already
         * passed, the difference wraps to a large value and this test
         * does not trigger - confirm that is intended.
         */
        if (sample_valid(cic->ttime.ttime_samples) &&
            (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
                cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
                             cic->ttime.ttime_mean);
                return;
        }

        /* There are other queues in the group, don't do group idle */
        if (group_idle && cfqq->cfqg->nr_cfqq > 1)
                return;

        cfq_mark_cfqq_wait_request(cfqq);

        /* pick the timeout that matches the kind of idling chosen above */
        if (group_idle)
                sl = cfqd->cfq_group_idle;
        else
                sl = cfqd->cfq_slice_idle;

        mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
        cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
        cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
                        group_idle ? 1 : 0);
}
2026
2027 /*
2028  * Move request from internal lists to the request queue dispatch list.
2029  */
2030 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2031 {
2032         struct cfq_data *cfqd = q->elevator->elevator_data;
2033         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2034
2035         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2036
2037         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2038         cfq_remove_request(rq);
2039         cfqq->dispatched++;
2040         (RQ_CFQG(rq))->dispatched++;
2041         elv_dispatch_sort(q, rq);
2042
2043         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2044         cfqq->nr_sectors += blk_rq_sectors(rq);
2045         cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
2046                                         rq_data_dir(rq), rq_is_sync(rq));
2047 }
2048
2049 /*
2050  * return expired entry, or NULL to just start from scratch in rbtree
2051  */
2052 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2053 {
2054         struct request *rq = NULL;
2055
2056         if (cfq_cfqq_fifo_expire(cfqq))
2057                 return NULL;
2058
2059         cfq_mark_cfqq_fifo_expire(cfqq);
2060
2061         if (list_empty(&cfqq->fifo))
2062                 return NULL;
2063
2064         rq = rq_entry_fifo(cfqq->fifo.next);
2065         if (time_before(jiffies, rq_fifo_time(rq)))
2066                 rq = NULL;
2067
2068         cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2069         return rq;
2070 }
2071
2072 static inline int
2073 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2074 {
2075         const int base_rq = cfqd->cfq_slice_async_rq;
2076
2077         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2078
2079         return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2080 }
2081
2082 /*
2083  * Must be called with the queue_lock held.
2084  */
2085 static int cfqq_process_refs(struct cfq_queue *cfqq)
2086 {
2087         int process_refs, io_refs;
2088
2089         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2090         process_refs = cfqq->ref - io_refs;
2091         BUG_ON(process_refs < 0);
2092         return process_refs;
2093 }
2094
2095 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2096 {
2097         int process_refs, new_process_refs;
2098         struct cfq_queue *__cfqq;
2099
2100         /*
2101          * If there are no process references on the new_cfqq, then it is
2102          * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2103          * chain may have dropped their last reference (not just their
2104          * last process reference).
2105          */
2106         if (!cfqq_process_refs(new_cfqq))
2107                 return;
2108
2109         /* Avoid a circular list and skip interim queue merges */
2110         while ((__cfqq = new_cfqq->new_cfqq)) {
2111                 if (__cfqq == cfqq)
2112                         return;
2113                 new_cfqq = __cfqq;
2114         }
2115
2116         process_refs = cfqq_process_refs(cfqq);
2117         new_process_refs = cfqq_process_refs(new_cfqq);
2118         /*
2119          * If the process for the cfqq has gone away, there is no
2120          * sense in merging the queues.
2121          */
2122         if (process_refs == 0 || new_process_refs == 0)
2123                 return;
2124
2125         /*
2126          * Merge in the direction of the lesser amount of work.
2127          */
2128         if (new_process_refs >= process_refs) {
2129                 cfqq->new_cfqq = new_cfqq;
2130                 new_cfqq->ref += process_refs;
2131         } else {
2132                 new_cfqq->new_cfqq = cfqq;
2133                 cfqq->ref += new_process_refs;
2134         }
2135 }
2136
2137 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2138                                 struct cfq_group *cfqg, enum wl_prio_t prio)
2139 {
2140         struct cfq_queue *queue;
2141         int i;
2142         bool key_valid = false;
2143         unsigned long lowest_key = 0;
2144         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2145
2146         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2147                 /* select the one with lowest rb_key */
2148                 queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
2149                 if (queue &&
2150                     (!key_valid || time_before(queue->rb_key, lowest_key))) {
2151                         lowest_key = queue->rb_key;
2152                         cur_best = i;
2153                         key_valid = true;
2154                 }
2155         }
2156
2157         return cur_best;
2158 }
2159
2160 static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2161 {
2162         unsigned slice;
2163         unsigned count;
2164         struct cfq_rb_root *st;
2165         unsigned group_slice;
2166         enum wl_prio_t original_prio = cfqd->serving_prio;
2167
2168         /* Choose next priority. RT > BE > IDLE */
2169         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2170                 cfqd->serving_prio = RT_WORKLOAD;
2171         else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2172                 cfqd->serving_prio = BE_WORKLOAD;
2173         else {
2174                 cfqd->serving_prio = IDLE_WORKLOAD;
2175                 cfqd->workload_expires = jiffies + 1;
2176                 return;
2177         }
2178
2179         if (original_prio != cfqd->serving_prio)
2180                 goto new_workload;
2181
2182         /*
2183          * For RT and BE, we have to choose also the type
2184          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2185          * expiration time
2186          */
2187         st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2188         count = st->count;
2189
2190         /*
2191          * check workload expiration, and that we still have other queues ready
2192          */
2193         if (count && !time_after(jiffies, cfqd->workload_expires))
2194                 return;
2195
2196 new_workload:
2197         /* otherwise select new workload type */
2198         cfqd->serving_type =
2199                 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2200         st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2201         count = st->count;
2202
2203         /*
2204          * the workload slice is computed as a fraction of target latency
2205          * proportional to the number of queues in that workload, over
2206          * all the queues in the same priority class
2207          */
2208         group_slice = cfq_group_slice(cfqd, cfqg);
2209
2210         slice = group_slice * count /
2211                 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2212                       cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2213
2214         if (cfqd->serving_type == ASYNC_WORKLOAD) {
2215                 unsigned int tmp;
2216
2217                 /*
2218                  * Async queues are currently system wide. Just taking
2219                  * proportion of queues with-in same group will lead to higher
2220                  * async ratio system wide as generally root group is going
2221                  * to have higher weight. A more accurate thing would be to
2222                  * calculate system wide asnc/sync ratio.
2223                  */
2224                 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2225                 tmp = tmp/cfqd->busy_queues;
2226                 slice = min_t(unsigned, slice, tmp);
2227
2228                 /* async workload slice is scaled down according to
2229                  * the sync/async slice ratio. */
2230                 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2231         } else
2232                 /* sync workload slice is at least 2 * cfq_slice_idle */
2233                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2234
2235         slice = max_t(unsigned, slice, CFQ_MIN_TT);
2236         cfq_log(cfqd, "workload slice:%d", slice);
2237         cfqd->workload_expires = jiffies + slice;
2238 }
2239
2240 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2241 {
2242         struct cfq_rb_root *st = &cfqd->grp_service_tree;
2243         struct cfq_group *cfqg;
2244
2245         if (RB_EMPTY_ROOT(&st->rb))
2246                 return NULL;
2247         cfqg = cfq_rb_first_group(st);
2248         update_min_vdisktime(st);
2249         return cfqg;
2250 }
2251
2252 static void cfq_choose_cfqg(struct cfq_data *cfqd)
2253 {
2254         struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2255
2256         cfqd->serving_group = cfqg;
2257
2258         /* Restore the workload type data */
2259         if (cfqg->saved_workload_slice) {
2260                 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2261                 cfqd->serving_type = cfqg->saved_workload;
2262                 cfqd->serving_prio = cfqg->saved_serving_prio;
2263         } else
2264                 cfqd->workload_expires = jiffies - 1;
2265
2266         choose_service_tree(cfqd, cfqg);
2267 }
2268
/*
 * Select a queue for service. If we have a current active queue,
 * check whether to continue servicing it, or retrieve and set a new one.
 *
 * Returns the queue to dispatch from, or NULL when nothing should be
 * dispatched right now.  Several paths below set cfqq to NULL and jump to
 * keep_queue: that returns NULL *without* expiring the active queue, i.e.
 * the current queue stays active while we wait.
 *
 * Labels:
 *   check_group_idle - decide whether to wait on behalf of the group
 *   expire           - end the slice of the active queue
 *   new_queue        - choose a group (unless a cooperator was found)
 *                      and activate a new queue
 *   keep_queue       - return cfqq as it stands
 */
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
        struct cfq_queue *cfqq, *new_cfqq = NULL;

        cfqq = cfqd->active_queue;
        if (!cfqq)
                goto new_queue;

        if (!cfqd->rq_queued)
                return NULL;

        /*
         * We were waiting for group to get backlogged. Expire the queue
         */
        if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
                goto expire;

        /*
         * The active queue has run out of time, expire it and select new.
         */
        if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
                /*
                 * If slice had not expired at the completion of last request
                 * we might not have turned on wait_busy flag. Don't expire
                 * the queue yet. Allow the group to get backlogged.
                 *
                 * The very fact that we have used the slice, that means we
                 * have been idling all along on this queue and it should be
                 * ok to wait for this request to complete.
                 */
                if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
                    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
                        /* keep the queue active, but dispatch nothing */
                        cfqq = NULL;
                        goto keep_queue;
                } else
                        goto check_group_idle;
        }

        /*
         * The active queue has requests and isn't expired, allow it to
         * dispatch.
         */
        if (!RB_EMPTY_ROOT(&cfqq->sort_list))
                goto keep_queue;

        /*
         * If another queue has a request waiting within our mean seek
         * distance, let it run.  The expire code will check for close
         * cooperators and put the close queue at the front of the service
         * tree.  If possible, merge the expiring queue with the new cfqq.
         */
        new_cfqq = cfq_close_cooperator(cfqd, cfqq);
        if (new_cfqq) {
                /* only set up a merge if none is scheduled already */
                if (!cfqq->new_cfqq)
                        cfq_setup_merge(cfqq, new_cfqq);
                goto expire;
        }

        /*
         * No requests pending. If the active queue still has requests in
         * flight or is idling for a new request, allow either of these
         * conditions to happen (or time out) before selecting a new queue.
         */
        if (timer_pending(&cfqd->idle_slice_timer)) {
                cfqq = NULL;
                goto keep_queue;
        }

        /*
         * This is a deep seek queue, but the device is much faster than
         * the queue can deliver, don't idle
         **/
        if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
            (cfq_cfqq_slice_new(cfqq) ||
            (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
                cfq_clear_cfqq_deep(cfqq);
                cfq_clear_cfqq_idle_window(cfqq);
        }

        /* requests still in flight and idling applies: wait for them */
        if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
                cfqq = NULL;
                goto keep_queue;
        }

        /*
         * If group idle is enabled and there are requests dispatched from
         * this group, wait for requests to complete.
         */
check_group_idle:
        if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
            cfqq->cfqg->dispatched &&
            !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
                cfqq = NULL;
                goto keep_queue;
        }

expire:
        cfq_slice_expired(cfqd, 0);
new_queue:
        /*
         * Current queue expired. Check if we have to switch to a new
         * service tree
         */
        if (!new_cfqq)
                cfq_choose_cfqg(cfqd);

        /* a close cooperator, if found, is activated directly */
        cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
        return cfqq;
}
2383
2384 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2385 {
2386         int dispatched = 0;
2387
2388         while (cfqq->next_rq) {
2389                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2390                 dispatched++;
2391         }
2392
2393         BUG_ON(!list_empty(&cfqq->fifo));
2394
2395         /* By default cfqq is not expired if it is empty. Do it explicitly */
2396         __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2397         return dispatched;
2398 }
2399
2400 /*
2401  * Drain our current requests. Used for barriers and when switching
2402  * io schedulers on-the-fly.
2403  */
2404 static int cfq_forced_dispatch(struct cfq_data *cfqd)
2405 {
2406         struct cfq_queue *cfqq;
2407         int dispatched = 0;
2408
2409         /* Expire the timeslice of the current active queue first */
2410         cfq_slice_expired(cfqd, 0);
2411         while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2412                 __cfq_set_active_queue(cfqd, cfqq);
2413                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2414         }
2415
2416         BUG_ON(cfqd->busy_queues);
2417
2418         cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2419         return dispatched;
2420 }
2421
2422 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2423         struct cfq_queue *cfqq)
2424 {
2425         /* the queue hasn't finished any request, can't estimate */
2426         if (cfq_cfqq_slice_new(cfqq))
2427                 return true;
2428         if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2429                 cfqq->slice_end))
2430                 return true;
2431
2432         return false;
2433 }
2434
/*
 * Decide whether @cfqq is allowed to dispatch one more request now.
 *
 * A dispatch limit (max_dispatch) is derived from cfqd->cfq_quantum and
 * then adjusted for idle class queues, for queues already over the limit
 * and for async queues.  Returns true when cfqq->dispatched is below the
 * resulting limit, false otherwise (including the early bail-outs below).
 */
static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
        unsigned int max_dispatch;

        /*
         * Drain async requests before we start sync IO
         */
        if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
                return false;

        /*
         * If this is an async queue and we have sync IO in flight, let it wait
         */
        if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
                return false;

        /* base limit: half the quantum, but at least one request */
        max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
        if (cfq_class_idle(cfqq))
                max_dispatch = 1;

        /*
         * Does this cfqq already have too much IO in flight?
         */
        if (cfqq->dispatched >= max_dispatch) {
                bool promote_sync = false;
                /*
                 * idle queue must always only have a single IO in flight
                 */
                if (cfq_class_idle(cfqq))
                        return false;

                /*
                 * If there is only one sync queue
                 * we can ignore async queue here and give the sync
                 * queue no dispatch limit. The reason is a sync queue can
                 * preempt async queue, limiting the sync queue doesn't make
                 * sense. This is useful for aiostress test.
                 */
                if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
                        promote_sync = true;

                /*
                 * We have other queues, don't allow more IO from this one
                 */
                if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
                                !promote_sync)
                        return false;

                /*
                 * Sole queue user, no limit
                 *
                 * max_dispatch is unsigned, so assigning -1 yields the
                 * largest representable value.
                 */
                if (cfqd->busy_queues == 1 || promote_sync)
                        max_dispatch = -1;
                else
                        /*
                         * Normally we start throttling cfqq when cfq_quantum/2
                         * requests have been dispatched. But we can drive
                         * deeper queue depths at the beginning of slice
                         * subjected to upper limit of cfq_quantum.
                         * */
                        max_dispatch = cfqd->cfq_quantum;
        }

        /*
         * Async queues must wait a bit before being allowed dispatch.
         * We also ramp up the dispatch depth gradually for async IO,
         * based on the last sync IO we serviced
         */
        if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
                /* time since last_delayed_sync, in jiffies */
                unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
                unsigned int depth;

                /* one unit of depth per cfq_slice[1] elapsed */
                depth = last_sync / cfqd->cfq_slice[1];
                /* always let an async queue with nothing in flight send one */
                if (!depth && !cfqq->dispatched)
                        depth = 1;
                /* the ramp can only lower the limit, never raise it */
                if (depth < max_dispatch)
                        max_dispatch = depth;
        }

        /*
         * If we're below the current max, allow a dispatch
         */
        return cfqq->dispatched < max_dispatch;
}
2519
2520 /*
2521  * Dispatch a request from cfqq, moving them to the request queue
2522  * dispatch list.
2523  */
2524 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2525 {
2526         struct request *rq;
2527
2528         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2529
2530         if (!cfq_may_dispatch(cfqd, cfqq))
2531                 return false;
2532
2533         /*
2534          * follow expired path, else get first next available
2535          */
2536         rq = cfq_check_fifo(cfqq);
2537         if (!rq)
2538                 rq = cfqq->next_rq;
2539
2540         /*
2541          * insert request into driver dispatch list
2542          */
2543         cfq_dispatch_insert(cfqd->queue, rq);
2544
2545         if (!cfqd->active_cic) {
2546                 struct cfq_io_cq *cic = RQ_CIC(rq);
2547
2548                 atomic_long_inc(&cic->icq.ioc->refcount);
2549                 cfqd->active_cic = cic;
2550         }
2551
2552         return true;
2553 }
2554
2555 /*
2556  * Find the cfqq that we need to service and move a request from that to the
2557  * dispatch list
2558  */
2559 static int cfq_dispatch_requests(struct request_queue *q, int force)
2560 {
2561         struct cfq_data *cfqd = q->elevator->elevator_data;
2562         struct cfq_queue *cfqq;
2563
2564         if (!cfqd->busy_queues)
2565                 return 0;
2566
2567         if (unlikely(force))
2568                 return cfq_forced_dispatch(cfqd);
2569
2570         cfqq = cfq_select_queue(cfqd);
2571         if (!cfqq)
2572                 return 0;
2573
2574         /*
2575          * Dispatch a request from this cfqq, if it is allowed
2576          */
2577         if (!cfq_dispatch_request(cfqd, cfqq))
2578                 return 0;
2579
2580         cfqq->slice_dispatch++;
2581         cfq_clear_cfqq_must_dispatch(cfqq);
2582
2583         /*
2584          * expire an async queue immediately if it has used up its slice. idle
2585          * queue always expire after 1 dispatch round.
2586          */
2587         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2588             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2589             cfq_class_idle(cfqq))) {
2590                 cfqq->slice_end = jiffies + 1;
2591                 cfq_slice_expired(cfqd, 0);
2592         }
2593
2594         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2595         return 1;
2596 }
2597
2598 /*
2599  * task holds one reference to the queue, dropped when task exits. each rq
2600  * in-flight on this queue also holds a reference, dropped when rq is freed.
2601  *
2602  * Each cfq queue took a reference on the parent group. Drop it now.
2603  * queue lock must be held here.
2604  */
2605 static void cfq_put_queue(struct cfq_queue *cfqq)
2606 {
2607         struct cfq_data *cfqd = cfqq->cfqd;
2608         struct cfq_group *cfqg;
2609
2610         BUG_ON(cfqq->ref <= 0);
2611
2612         cfqq->ref--;
2613         if (cfqq->ref)
2614                 return;
2615
2616         cfq_log_cfqq(cfqd, cfqq, "put_queue");
2617         BUG_ON(rb_first(&cfqq->sort_list));
2618         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2619         cfqg = cfqq->cfqg;
2620
2621         if (unlikely(cfqd->active_queue == cfqq)) {
2622                 __cfq_slice_expired(cfqd, cfqq, 0);
2623                 cfq_schedule_dispatch(cfqd);
2624         }
2625
2626         BUG_ON(cfq_cfqq_on_rr(cfqq));
2627         kmem_cache_free(cfq_pool, cfqq);
2628         cfq_put_cfqg(cfqg);
2629 }
2630
2631 static void cfq_put_cooperator(struct cfq_queue *cfqq)
2632 {
2633         struct cfq_queue *__cfqq, *next;
2634
2635         /*
2636          * If this queue was scheduled to merge with another queue, be
2637          * sure to drop the reference taken on that queue (and others in
2638          * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2639          */
2640         __cfqq = cfqq->new_cfqq;
2641         while (__cfqq) {
2642                 if (__cfqq == cfqq) {
2643                         WARN(1, "cfqq->new_cfqq loop detected\n");
2644                         break;
2645                 }
2646                 next = __cfqq->new_cfqq;
2647                 cfq_put_queue(__cfqq);
2648                 __cfqq = next;
2649         }
2650 }
2651
2652 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2653 {
2654         if (unlikely(cfqq == cfqd->active_queue)) {
2655                 __cfq_slice_expired(cfqd, cfqq, 0);
2656                 cfq_schedule_dispatch(cfqd);
2657         }
2658
2659         cfq_put_cooperator(cfqq);
2660
2661         cfq_put_queue(cfqq);
2662 }
2663
2664 static void cfq_init_icq(struct io_cq *icq)
2665 {
2666         struct cfq_io_cq *cic = icq_to_cic(icq);
2667
2668         cic->ttime.last_end_request = jiffies;
2669 }
2670
2671 static void cfq_exit_icq(struct io_cq *icq)
2672 {
2673         struct cfq_io_cq *cic = icq_to_cic(icq);
2674         struct cfq_data *cfqd = cic_to_cfqd(cic);
2675
2676         if (cic->cfqq[BLK_RW_ASYNC]) {
2677                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2678                 cic->cfqq[BLK_RW_ASYNC] = NULL;
2679         }
2680
2681         if (cic->cfqq[BLK_RW_SYNC]) {
2682                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2683                 cic->cfqq[BLK_RW_SYNC] = NULL;
2684         }
2685 }
2686
2687 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2688 {
2689         struct task_struct *tsk = current;
2690         int ioprio_class;
2691
2692         if (!cfq_cfqq_prio_changed(cfqq))
2693                 return;
2694
2695         ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2696         switch (ioprio_class) {
2697         default:
2698                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2699         case IOPRIO_CLASS_NONE:
2700                 /*
2701                  * no prio set, inherit CPU scheduling settings
2702                  */
2703                 cfqq->ioprio = task_nice_ioprio(tsk);
2704                 cfqq->ioprio_class = task_nice_ioclass(tsk);
2705                 break;
2706         case IOPRIO_CLASS_RT:
2707                 cfqq->ioprio = task_ioprio(ioc);
2708                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
2709                 break;
2710         case IOPRIO_CLASS_BE:
2711                 cfqq->ioprio = task_ioprio(ioc);
2712                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
2713                 break;
2714         case IOPRIO_CLASS_IDLE:
2715                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2716                 cfqq->ioprio = 7;
2717                 cfq_clear_cfqq_idle_window(cfqq);
2718                 break;
2719         }
2720
2721         /*
2722          * keep track of original prio settings in case we have to temporarily
2723          * elevate the priority of this queue
2724          */
2725         cfqq->org_ioprio = cfqq->ioprio;
2726         cfq_clear_cfqq_prio_changed(cfqq);
2727 }
2728
2729 static void changed_ioprio(struct cfq_io_cq *cic)
2730 {
2731         struct cfq_data *cfqd = cic_to_cfqd(cic);
2732         struct cfq_queue *cfqq;
2733
2734         if (unlikely(!cfqd))
2735                 return;
2736
2737         cfqq = cic->cfqq[BLK_RW_ASYNC];
2738         if (cfqq) {
2739                 struct cfq_queue *new_cfqq;
2740                 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
2741                                                 GFP_ATOMIC);
2742                 if (new_cfqq) {
2743                         cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2744                         cfq_put_queue(cfqq);
2745                 }
2746         }
2747
2748         cfqq = cic->cfqq[BLK_RW_SYNC];
2749         if (cfqq)
2750                 cfq_mark_cfqq_prio_changed(cfqq);
2751 }
2752
2753 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2754                           pid_t pid, bool is_sync)
2755 {
2756         RB_CLEAR_NODE(&cfqq->rb_node);
2757         RB_CLEAR_NODE(&cfqq->p_node);
2758         INIT_LIST_HEAD(&cfqq->fifo);
2759
2760         cfqq->ref = 0;
2761         cfqq->cfqd = cfqd;
2762
2763         cfq_mark_cfqq_prio_changed(cfqq);
2764
2765         if (is_sync) {
2766                 if (!cfq_class_idle(cfqq))
2767                         cfq_mark_cfqq_idle_window(cfqq);
2768                 cfq_mark_cfqq_sync(cfqq);
2769         }
2770         cfqq->pid = pid;
2771 }
2772
2773 #ifdef CONFIG_CFQ_GROUP_IOSCHED
2774 static void changed_cgroup(struct cfq_io_cq *cic)
2775 {
2776         struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2777         struct cfq_data *cfqd = cic_to_cfqd(cic);
2778         struct request_queue *q;
2779
2780         if (unlikely(!cfqd))
2781                 return;
2782
2783         q = cfqd->queue;
2784
2785         if (sync_cfqq) {
2786                 /*
2787                  * Drop reference to sync queue. A new sync queue will be
2788                  * assigned in new group upon arrival of a fresh request.
2789                  */
2790                 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2791                 cic_set_cfqq(cic, NULL, 1);
2792                 cfq_put_queue(sync_cfqq);
2793         }
2794 }
2795 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
2796
2797 static struct cfq_queue *
2798 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2799                      struct io_context *ioc, gfp_t gfp_mask)
2800 {
2801         struct blkio_cgroup *blkcg;
2802         struct cfq_queue *cfqq, *new_cfqq = NULL;
2803         struct cfq_io_cq *cic;
2804         struct cfq_group *cfqg;
2805
2806 retry:
2807         rcu_read_lock();
2808
2809         blkcg = task_blkio_cgroup(current);
2810
2811         cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
2812
2813         cic = cfq_cic_lookup(cfqd, ioc);
2814         /* cic always exists here */
2815         cfqq = cic_to_cfqq(cic, is_sync);
2816
2817         /*
2818          * Always try a new alloc if we fell back to the OOM cfqq
2819          * originally, since it should just be a temporary situation.
2820          */
2821         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2822                 cfqq = NULL;
2823                 if (new_cfqq) {
2824                         cfqq = new_cfqq;
2825                         new_cfqq = NULL;
2826                 } else if (gfp_mask & __GFP_WAIT) {
2827                         rcu_read_unlock();
2828                         spin_unlock_irq(cfqd->queue->queue_lock);
2829                         new_cfqq = kmem_cache_alloc_node(cfq_pool,
2830                                         gfp_mask | __GFP_ZERO,
2831                                         cfqd->queue->node);
2832                         spin_lock_irq(cfqd->queue->queue_lock);
2833                         if (new_cfqq)
2834                                 goto retry;
2835                 } else {
2836                         cfqq = kmem_cache_alloc_node(cfq_pool,
2837                                         gfp_mask | __GFP_ZERO,
2838                                         cfqd->queue->node);
2839                 }
2840
2841                 if (cfqq) {
2842                         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2843                         cfq_init_prio_data(cfqq, ioc);
2844                         cfq_link_cfqq_cfqg(cfqq, cfqg);
2845                         cfq_log_cfqq(cfqd, cfqq, "alloced");
2846                 } else
2847                         cfqq = &cfqd->oom_cfqq;
2848         }
2849
2850         if (new_cfqq)
2851                 kmem_cache_free(cfq_pool, new_cfqq);
2852
2853         rcu_read_unlock();
2854         return cfqq;
2855 }
2856
2857 static struct cfq_queue **
2858 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2859 {
2860         switch (ioprio_class) {
2861         case IOPRIO_CLASS_RT:
2862                 return &cfqd->async_cfqq[0][ioprio];
2863         case IOPRIO_CLASS_BE:
2864                 return &cfqd->async_cfqq[1][ioprio];
2865         case IOPRIO_CLASS_IDLE:
2866                 return &cfqd->async_idle_cfqq;
2867         default:
2868                 BUG();
2869         }
2870 }
2871
2872 static struct cfq_queue *
2873 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2874               gfp_t gfp_mask)
2875 {
2876         const int ioprio = task_ioprio(ioc);
2877         const int ioprio_class = task_ioprio_class(ioc);
2878         struct cfq_queue **async_cfqq = NULL;
2879         struct cfq_queue *cfqq = NULL;
2880
2881         if (!is_sync) {
2882                 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2883                 cfqq = *async_cfqq;
2884         }
2885
2886         if (!cfqq)
2887                 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2888
2889         /*
2890          * pin the queue now that it's allocated, scheduler exit will prune it
2891          */
2892         if (!is_sync && !(*async_cfqq)) {
2893                 cfqq->ref++;
2894                 *async_cfqq = cfqq;
2895         }
2896
2897         cfqq->ref++;
2898         return cfqq;
2899 }
2900
2901 static void
2902 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
2903 {
2904         unsigned long elapsed = jiffies - ttime->last_end_request;
2905         elapsed = min(elapsed, 2UL * slice_idle);
2906
2907         ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
2908         ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
2909         ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
2910 }
2911
2912 static void
2913 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2914                         struct cfq_io_cq *cic)
2915 {
2916         if (cfq_cfqq_sync(cfqq)) {
2917                 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
2918                 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
2919                         cfqd->cfq_slice_idle);
2920         }
2921 #ifdef CONFIG_CFQ_GROUP_IOSCHED
2922         __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
2923 #endif
2924 }
2925
2926 static void
2927 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2928                        struct request *rq)
2929 {
2930         sector_t sdist = 0;
2931         sector_t n_sec = blk_rq_sectors(rq);
2932         if (cfqq->last_request_pos) {
2933                 if (cfqq->last_request_pos < blk_rq_pos(rq))
2934                         sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
2935                 else
2936                         sdist = cfqq->last_request_pos - blk_rq_pos(rq);
2937         }
2938
2939         cfqq->seek_history <<= 1;
2940         if (blk_queue_nonrot(cfqd->queue))
2941                 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
2942         else
2943                 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
2944 }
2945
2946 /*
2947  * Disable idle window if the process thinks too long or seeks so much that
2948  * it doesn't matter
2949  */
2950 static void
2951 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2952                        struct cfq_io_cq *cic)
2953 {
2954         int old_idle, enable_idle;
2955
2956         /*
2957          * Don't idle for async or idle io prio class
2958          */
2959         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
2960                 return;
2961
2962         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
2963
2964         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
2965                 cfq_mark_cfqq_deep(cfqq);
2966
2967         if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
2968                 enable_idle = 0;
2969         else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
2970                  !cfqd->cfq_slice_idle ||
2971                  (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
2972                 enable_idle = 0;
2973         else if (sample_valid(cic->ttime.ttime_samples)) {
2974                 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
2975                         enable_idle = 0;
2976                 else
2977                         enable_idle = 1;
2978         }
2979
2980         if (old_idle != enable_idle) {
2981                 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
2982                 if (enable_idle)
2983                         cfq_mark_cfqq_idle_window(cfqq);
2984                 else
2985                         cfq_clear_cfqq_idle_window(cfqq);
2986         }
2987 }
2988
2989 /*
2990  * Check if new_cfqq should preempt the currently active queue. Return 0 for
2991  * no or if we aren't sure, a 1 will cause a preempt.
2992  */
2993 static bool
2994 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
2995                    struct request *rq)
2996 {
2997         struct cfq_queue *cfqq;
2998
2999         cfqq = cfqd->active_queue;
3000         if (!cfqq)
3001                 return false;
3002
3003         if (cfq_class_idle(new_cfqq))
3004                 return false;
3005
3006         if (cfq_class_idle(cfqq))
3007                 return true;
3008
3009         /*
3010          * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3011          */
3012         if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3013                 return false;
3014
3015         /*
3016          * if the new request is sync, but the currently running queue is
3017          * not, let the sync request have priority.
3018          */
3019         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3020                 return true;
3021
3022         if (new_cfqq->cfqg != cfqq->cfqg)
3023                 return false;
3024
3025         if (cfq_slice_used(cfqq))
3026                 return true;
3027
3028         /* Allow preemption only if we are idling on sync-noidle tree */
3029         if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3030             cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3031             new_cfqq->service_tree->count == 2 &&
3032             RB_EMPTY_ROOT(&cfqq->sort_list))
3033                 return true;
3034
3035         /*
3036          * So both queues are sync. Let the new request get disk time if
3037          * it's a metadata request and the current queue is doing regular IO.
3038          */
3039         if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3040                 return true;
3041
3042         /*
3043          * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3044          */
3045         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3046                 return true;
3047
3048         /* An idle queue should not be idle now for some reason */
3049         if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3050                 return true;
3051
3052         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3053                 return false;
3054
3055         /*
3056          * if this request is as-good as one we would expect from the
3057          * current cfqq, let it preempt
3058          */
3059         if (cfq_rq_close(cfqd, cfqq, rq))
3060                 return true;
3061
3062         return false;
3063 }
3064
3065 /*
3066  * cfqq preempts the active queue. if we allowed preempt with no slice left,
3067  * let it have half of its nominal slice.
3068  */
3069 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3070 {
3071         enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
3072
3073         cfq_log_cfqq(cfqd, cfqq, "preempt");
3074         cfq_slice_expired(cfqd, 1);
3075
3076         /*
3077          * workload type is changed, don't save slice, otherwise preempt
3078          * doesn't happen
3079          */
3080         if (old_type != cfqq_type(cfqq))
3081                 cfqq->cfqg->saved_workload_slice = 0;
3082
3083         /*
3084          * Put the new queue at the front of the of the current list,
3085          * so we know that it will be selected next.
3086          */
3087         BUG_ON(!cfq_cfqq_on_rr(cfqq));
3088
3089         cfq_service_tree_add(cfqd, cfqq, 1);
3090
3091         cfqq->slice_end = 0;
3092         cfq_mark_cfqq_slice_new(cfqq);
3093 }
3094
/*
 * Called when a new fs request (rq) is added (to cfqq). Updates the queued
 * request accounting and the think time, seek and idle window heuristics,
 * then checks whether the new request should end an ongoing idle wait of
 * the active queue or preempt the active queue.
 */
static void
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                struct request *rq)
{
        struct cfq_io_cq *cic = RQ_CIC(rq);

        cfqd->rq_queued++;
        if (rq->cmd_flags & REQ_PRIO)
                cfqq->prio_pending++;

        cfq_update_io_thinktime(cfqd, cfqq, cic);
        cfq_update_io_seektime(cfqd, cfqq, rq);
        cfq_update_idle_window(cfqd, cfqq, cic);

        /* end position of this request, used for the next seek distance */
        cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

        if (cfqq == cfqd->active_queue) {
                /*
                 * Remember that we saw a request from this process, but
                 * don't start queuing just yet. Otherwise we risk seeing lots
                 * of tiny requests, because we disrupt the normal plugging
                 * and merging. If the request is already larger than a single
                 * page, let it rip immediately. For that case we assume that
                 * merging is already done. Ditto for a busy system that
                 * has other work pending, don't risk delaying until the
                 * idle timer unplug to continue working.
                 */
                if (cfq_cfqq_wait_request(cfqq)) {
                        if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
                            cfqd->busy_queues > 1) {
                                cfq_del_timer(cfqd, cfqq);
                                cfq_clear_cfqq_wait_request(cfqq);
                                __blk_run_queue(cfqd->queue);
                        } else {
                                cfq_blkiocg_update_idle_time_stats(
                                                &cfqq->cfqg->blkg);
                                cfq_mark_cfqq_must_dispatch(cfqq);
                        }
                }
        } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
                /*
                 * not the active queue - expire current slice if it is
                 * idle and has expired its mean thinktime or this new queue
                 * has some old slice time left and is of higher priority or
                 * this new queue is RT and the current one is BE
                 */
                cfq_preempt_queue(cfqd, cfqq);
                __blk_run_queue(cfqd->queue);
        }
}
3149
3150 static void cfq_insert_request(struct request_queue *q, struct request *rq)
3151 {
3152         struct cfq_data *cfqd = q->elevator->elevator_data;
3153         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3154
3155         cfq_log_cfqq(cfqd, cfqq, "insert_request");
3156         cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
3157
3158         rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3159         list_add_tail(&rq->queuelist, &cfqq->fifo);
3160         cfq_add_rq_rb(rq);
3161         cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3162                         &cfqd->serving_group->blkg, rq_data_dir(rq),
3163                         rq_is_sync(rq));
3164         cfq_rq_enqueued(cfqd, cfqq, rq);
3165 }
3166
3167 /*
3168  * Update hw_tag based on peak queue depth over 50 samples under
3169  * sufficient load.
3170  */
3171 static void cfq_update_hw_tag(struct cfq_data *cfqd)
3172 {
3173         struct cfq_queue *cfqq = cfqd->active_queue;
3174
3175         if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3176                 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3177
3178         if (cfqd->hw_tag == 1)
3179                 return;
3180
3181         if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3182             cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3183                 return;
3184
3185         /*
3186          * If active queue hasn't enough requests and can idle, cfq might not
3187          * dispatch sufficient requests to hardware. Don't zero hw_tag in this
3188          * case
3189          */
3190         if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3191             cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3192             CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3193                 return;
3194
3195         if (cfqd->hw_tag_samples++ < 50)
3196                 return;
3197
3198         if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3199                 cfqd->hw_tag = 1;
3200         else
3201                 cfqd->hw_tag = 0;
3202 }
3203
3204 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3205 {
3206         struct cfq_io_cq *cic = cfqd->active_cic;
3207
3208         /* If the queue already has requests, don't wait */
3209         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3210                 return false;
3211
3212         /* If there are other queues in the group, don't wait */
3213         if (cfqq->cfqg->nr_cfqq > 1)
3214                 return false;
3215
3216         /* the only queue in the group, but think time is big */
3217         if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
3218                 return false;
3219
3220         if (cfq_slice_used(cfqq))
3221                 return true;
3222
3223         /* if slice left is less than think time, wait busy */
3224         if (cic && sample_valid(cic->ttime.ttime_samples)
3225             && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
3226                 return true;
3227
3228         /*
3229          * If think times is less than a jiffy than ttime_mean=0 and above
3230          * will not be true. It might happen that slice has not expired yet
3231          * but will expire soon (4-5 ns) during select_queue(). To cover the
3232          * case where think time is less than a jiffy, mark the queue wait
3233          * busy if only 1 jiffy is left in the slice.
3234          */
3235         if (cfqq->slice_end - jiffies == 1)
3236                 return true;
3237
3238         return false;
3239 }
3240
/*
 * Account for the completion of @rq: update the in-driver / dispatched
 * counters and completion statistics, record the completion time used by
 * the think time heuristics, and, if the request belongs to the active
 * queue, decide whether that queue is expired, kept waiting for more
 * requests or idled on. A dispatch is scheduled once the driver has no
 * requests left.
 */
static void cfq_completed_request(struct request_queue *q, struct request *rq)
{
        struct cfq_queue *cfqq = RQ_CFQQ(rq);
        struct cfq_data *cfqd = cfqq->cfqd;
        const int sync = rq_is_sync(rq);
        unsigned long now;

        now = jiffies;
        cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
                     !!(rq->cmd_flags & REQ_NOIDLE));

        cfq_update_hw_tag(cfqd);

        WARN_ON(!cfqd->rq_in_driver);
        WARN_ON(!cfqq->dispatched);
        cfqd->rq_in_driver--;
        cfqq->dispatched--;
        (RQ_CFQG(rq))->dispatched--;
        cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
                        rq_start_time_ns(rq), rq_io_start_time_ns(rq),
                        rq_data_dir(rq), rq_is_sync(rq));

        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;

        if (sync) {
                struct cfq_rb_root *service_tree;

                RQ_CIC(rq)->ttime.last_end_request = now;

                /*
                 * Use the tree the queue currently sits on if it is on the
                 * rr list, otherwise look the tree up from the queue's
                 * group, prio and type.
                 */
                if (cfq_cfqq_on_rr(cfqq))
                        service_tree = cfqq->service_tree;
                else
                        service_tree = service_tree_for(cfqq->cfqg,
                                cfqq_prio(cfqq), cfqq_type(cfqq));
                service_tree->ttime.last_end_request = now;
                /* the request completed no earlier than its sync fifo expiry */
                if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
                        cfqd->last_delayed_sync = now;
        }

#ifdef CONFIG_CFQ_GROUP_IOSCHED
        cfqq->cfqg->ttime.last_end_request = now;
#endif

        /*
         * If this is the active queue, check if it needs to be expired,
         * or if we want to idle in case it has no pending requests.
         */
        if (cfqd->active_queue == cfqq) {
                const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);

                if (cfq_cfqq_slice_new(cfqq)) {
                        cfq_set_prio_slice(cfqd, cfqq);
                        cfq_clear_cfqq_slice_new(cfqq);
                }

                /*
                 * Should we wait for next request to come in before we expire
                 * the queue.
                 */
                if (cfq_should_wait_busy(cfqd, cfqq)) {
                        /* extend by slice_idle, or by group_idle if that is 0 */
                        unsigned long extend_sl = cfqd->cfq_slice_idle;
                        if (!cfqd->cfq_slice_idle)
                                extend_sl = cfqd->cfq_group_idle;
                        cfqq->slice_end = jiffies + extend_sl;
                        cfq_mark_cfqq_wait_busy(cfqq);
                        cfq_log_cfqq(cfqd, cfqq, "will busy wait");
                }

                /*
                 * Idling is not enabled on:
                 * - expired queues
                 * - idle-priority queues
                 * - async queues
                 * - queues with still some requests queued
                 * - when there is a close cooperator
                 */
                if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
                        cfq_slice_expired(cfqd, 1);
                else if (sync && cfqq_empty &&
                         !cfq_close_cooperator(cfqd, cfqq)) {
                        cfq_arm_slice_timer(cfqd);
                }
        }

        if (!cfqd->rq_in_driver)
                cfq_schedule_dispatch(cfqd);
}
3328
3329 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3330 {
3331         if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3332                 cfq_mark_cfqq_must_alloc_slice(cfqq);
3333                 return ELV_MQUEUE_MUST;
3334         }
3335
3336         return ELV_MQUEUE_MAY;
3337 }
3338
3339 static int cfq_may_queue(struct request_queue *q, int rw)
3340 {
3341         struct cfq_data *cfqd = q->elevator->elevator_data;
3342         struct task_struct *tsk = current;
3343         struct cfq_io_cq *cic;
3344         struct cfq_queue *cfqq;
3345
3346         /*
3347          * don't force setup of a queue from here, as a call to may_queue
3348          * does not necessarily imply that a request actually will be queued.
3349          * so just lookup a possibly existing queue, or return 'may queue'
3350          * if that fails
3351          */
3352         cic = cfq_cic_lookup(cfqd, tsk->io_context);
3353         if (!cic)
3354                 return ELV_MQUEUE_MAY;
3355
3356         cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3357         if (cfqq) {
3358                 cfq_init_prio_data(cfqq, cic->icq.ioc);
3359
3360                 return __cfq_may_queue(cfqq);
3361         }
3362
3363         return ELV_MQUEUE_MAY;
3364 }
3365
3366 /*
3367  * queue lock held here
3368  */
3369 static void cfq_put_request(struct request *rq)
3370 {
3371         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3372
3373         if (cfqq) {
3374                 const int rw = rq_data_dir(rq);
3375
3376                 BUG_ON(!cfqq->allocated[rw]);
3377                 cfqq->allocated[rw]--;
3378
3379                 /* Put down rq reference on cfqg */
3380                 cfq_put_cfqg(RQ_CFQG(rq));
3381                 rq->elv.priv[0] = NULL;
3382                 rq->elv.priv[1] = NULL;
3383
3384                 cfq_put_queue(cfqq);
3385         }
3386 }
3387
3388 static struct cfq_queue *
3389 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3390                 struct cfq_queue *cfqq)
3391 {
3392         cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3393         cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3394         cfq_mark_cfqq_coop(cfqq->new_cfqq);
3395         cfq_put_queue(cfqq);
3396         return cic_to_cfqq(cic, 1);
3397 }
3398
3399 /*
3400  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3401  * was the last process referring to said cfqq.
3402  */
3403 static struct cfq_queue *
3404 split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3405 {
3406         if (cfqq_process_refs(cfqq) == 1) {
3407                 cfqq->pid = current->pid;
3408                 cfq_clear_cfqq_coop(cfqq);
3409                 cfq_clear_cfqq_split_coop(cfqq);
3410                 return cfqq;
3411         }
3412
3413         cic_set_cfqq(cic, NULL, 1);
3414
3415         cfq_put_cooperator(cfqq);
3416
3417         cfq_put_queue(cfqq);
3418         return NULL;
3419 }
/*
 * Allocate cfq data structures associated with this request.
 *
 * Takes the queue lock, handles pending ioprio/cgroup change
 * notifications on the request's cic, then finds or creates the cfqq for
 * the request's direction (splitting or merging cooperating queues when
 * they are flagged for it). The request takes a reference on the cfqq and
 * on its group; both are stored in rq->elv.priv[].
 *
 * Always returns 0.
 */
static int
cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
        struct cfq_data *cfqd = q->elevator->elevator_data;
        struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
        const int rw = rq_data_dir(rq);
        const bool is_sync = rq_is_sync(rq);
        struct cfq_queue *cfqq;
        unsigned int changed;

        might_sleep_if(gfp_mask & __GFP_WAIT);

        spin_lock_irq(q->queue_lock);

        /* handle changed notifications */
        changed = icq_get_changed(&cic->icq);
        if (unlikely(changed & ICQ_IOPRIO_CHANGED))
                changed_ioprio(cic);
#ifdef CONFIG_CFQ_GROUP_IOSCHED
        if (unlikely(changed & ICQ_CGROUP_CHANGED))
                changed_cgroup(cic);
#endif

new_queue:
        cfqq = cic_to_cfqq(cic, is_sync);
        /* no queue yet, or only the OOM fallback: get a real one */
        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
                cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
                cic_set_cfqq(cic, cfqq, is_sync);
        } else {
                /*
                 * If the queue was seeky for too long, break it apart.
                 */
                if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
                        cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
                        cfqq = split_cfqq(cic, cfqq);
                        /* the cic was detached, start over with a new queue */
                        if (!cfqq)
                                goto new_queue;
                }

                /*
                 * Check to see if this queue is scheduled to merge with
                 * another, closely cooperating queue.  The merging of
                 * queues happens here as it must be done in process context.
                 * The reference on new_cfqq was taken in merge_cfqqs.
                 */
                if (cfqq->new_cfqq)
                        cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
        }

        cfqq->allocated[rw]++;

        /* the request pins both the queue and the queue's group */
        cfqq->ref++;
        rq->elv.priv[0] = cfqq;
        rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
        spin_unlock_irq(q->queue_lock);
        return 0;
}
3480
3481 static void cfq_kick_queue(struct work_struct *work)
3482 {
3483         struct cfq_data *cfqd =
3484                 container_of(work, struct cfq_data, unplug_work);
3485         struct request_queue *q = cfqd->queue;
3486
3487         spin_lock_irq(q->queue_lock);
3488         __blk_run_queue(cfqd->queue);
3489         spin_unlock_irq(q->queue_lock);
3490 }
3491
3492 /*
3493  * Timer running if the active_queue is currently idling inside its time slice
3494  */
3495 static void cfq_idle_slice_timer(unsigned long data)
3496 {
3497         struct cfq_data *cfqd = (struct cfq_data *) data;
3498         struct cfq_queue *cfqq;
3499         unsigned long flags;
3500         int timed_out = 1;
3501
3502         cfq_log(cfqd, "idle timer fired");
3503
3504         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3505
3506         cfqq = cfqd->active_queue;
3507         if (cfqq) {
3508                 timed_out = 0;
3509
3510                 /*
3511                  * We saw a request before the queue expired, let it through
3512                  */
3513                 if (cfq_cfqq_must_dispatch(cfqq))
3514                         goto out_kick;
3515
3516                 /*
3517                  * expired
3518                  */
3519                 if (cfq_slice_used(cfqq))
3520                         goto expire;
3521
3522                 /*
3523                  * only expire and reinvoke request handler, if there are
3524                  * other queues with pending requests
3525                  */
3526                 if (!cfqd->busy_queues)
3527                         goto out_cont;
3528
3529                 /*
3530                  * not expired and it has a request pending, let it dispatch
3531                  */
3532                 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3533                         goto out_kick;
3534
3535                 /*
3536                  * Queue depth flag is reset only when the idle didn't succeed
3537                  */
3538                 cfq_clear_cfqq_deep(cfqq);
3539         }
3540 expire:
3541         cfq_slice_expired(cfqd, timed_out);
3542 out_kick:
3543         cfq_schedule_dispatch(cfqd);
3544 out_cont:
3545         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3546 }
3547
3548 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3549 {
3550         del_timer_sync(&cfqd->idle_slice_timer);
3551         cancel_work_sync(&cfqd->unplug_work);
3552 }
3553
3554 static void cfq_put_async_queues(struct cfq_data *cfqd)
3555 {
3556         int i;
3557
3558         for (i = 0; i < IOPRIO_BE_NR; i++) {
3559                 if (cfqd->async_cfqq[0][i])
3560                         cfq_put_queue(cfqd->async_cfqq[0][i]);
3561                 if (cfqd->async_cfqq[1][i])
3562                         cfq_put_queue(cfqd->async_cfqq[1][i]);
3563         }
3564
3565         if (cfqd->async_idle_cfqq)
3566                 cfq_put_queue(cfqd->async_idle_cfqq);
3567 }
3568
3569 static void cfq_exit_queue(struct elevator_queue *e)
3570 {
3571         struct cfq_data *cfqd = e->elevator_data;
3572         struct request_queue *q = cfqd->queue;
3573         bool wait = false;
3574
3575         cfq_shutdown_timer_wq(cfqd);
3576
3577         spin_lock_irq(q->queue_lock);
3578
3579         if (cfqd->active_queue)
3580                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3581
3582         cfq_put_async_queues(cfqd);
3583         cfq_release_cfq_groups(cfqd);
3584
3585         /*
3586          * If there are groups which we could not unlink from blkcg list,
3587          * wait for a rcu period for them to be freed.
3588          */
3589         if (cfqd->nr_blkcg_linked_grps)
3590                 wait = true;
3591
3592         spin_unlock_irq(q->queue_lock);
3593
3594         cfq_shutdown_timer_wq(cfqd);
3595
3596         /*
3597          * Wait for cfqg->blkg->key accessors to exit their grace periods.
3598          * Do this wait only if there are other unlinked groups out
3599          * there. This can happen if cgroup deletion path claimed the
3600          * responsibility of cleaning up a group before queue cleanup code
3601          * get to the group.
3602          *
3603          * Do not call synchronize_rcu() unconditionally as there are drivers
3604          * which create/delete request queue hundreds of times during scan/boot
3605          * and synchronize_rcu() can take significant time and slow down boot.
3606          */
3607         if (wait)
3608                 synchronize_rcu();
3609
3610 #ifndef CONFIG_CFQ_GROUP_IOSCHED
3611         kfree(cfqd->root_group);
3612 #endif
3613         kfree(cfqd);
3614 }
3615
3616 static int cfq_init_queue(struct request_queue *q)
3617 {
3618         struct cfq_data *cfqd;
3619         struct blkio_group *blkg __maybe_unused;
3620         int i;
3621
3622         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3623         if (!cfqd)
3624                 return -ENOMEM;
3625
3626         cfqd->queue = q;
3627         q->elevator->elevator_data = cfqd;
3628
3629         /* Init root service tree */
3630         cfqd->grp_service_tree = CFQ_RB_ROOT;
3631
3632         /* Init root group and prefer root group over other groups by default */
3633 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3634         rcu_read_lock();
3635         spin_lock_irq(q->queue_lock);
3636
3637         blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_PROP,
3638                                   true);
3639         if (!IS_ERR(blkg))
3640                 cfqd->root_group = cfqg_of_blkg(blkg);
3641
3642         spin_unlock_irq(q->queue_lock);
3643         rcu_read_unlock();
3644 #else
3645         cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
3646                                         GFP_KERNEL, cfqd->queue->node);
3647         if (cfqd->root_group)
3648                 cfq_init_cfqg_base(cfqd->root_group);
3649 #endif
3650         if (!cfqd->root_group) {
3651                 kfree(cfqd);
3652                 return -ENOMEM;
3653         }
3654
3655         cfqd->root_group->weight = 2*BLKIO_WEIGHT_DEFAULT;
3656
3657         /*
3658          * Not strictly needed (since RB_ROOT just clears the node and we
3659          * zeroed cfqd on alloc), but better be safe in case someone decides
3660          * to add magic to the rb code
3661          */
3662         for (i = 0; i < CFQ_PRIO_LISTS; i++)
3663                 cfqd->prio_trees[i] = RB_ROOT;
3664
3665         /*
3666          * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3667          * Grab a permanent reference to it, so that the normal code flow
3668          * will not attempt to free it.  oom_cfqq is linked to root_group
3669          * but shouldn't hold a reference as it'll never be unlinked.  Lose
3670          * the reference from linking right away.
3671          */
3672         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3673         cfqd->oom_cfqq.ref++;
3674         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
3675         cfq_put_cfqg(cfqd->root_group);
3676
3677         init_timer(&cfqd->idle_slice_timer);
3678         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3679         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3680
3681         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3682
3683         cfqd->cfq_quantum = cfq_quantum;
3684         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3685         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3686         cfqd->cfq_back_max = cfq_back_max;
3687         cfqd->cfq_back_penalty = cfq_back_penalty;
3688         cfqd->cfq_slice[0] = cfq_slice_async;
3689         cfqd->cfq_slice[1] = cfq_slice_sync;
3690         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3691         cfqd->cfq_slice_idle = cfq_slice_idle;
3692         cfqd->cfq_group_idle = cfq_group_idle;
3693         cfqd->cfq_latency = 1;
3694         cfqd->hw_tag = -1;
3695         /*
3696          * we optimistically start assuming sync ops weren't delayed in last
3697          * second, in order to have larger depth for async operations.
3698          */
3699         cfqd->last_delayed_sync = jiffies - HZ;
3700         return 0;
3701 }
3702
3703 /*
3704  * sysfs parts below -->
3705  */
/*
 * Format @var as an unsigned decimal number followed by a newline into
 * @page and return the number of characters written (as sprintf() reports
 * it).
 *
 * The value is unsigned, so it is printed with "%u": with "%d", anything
 * above INT_MAX would be shown as a negative number.
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}
3711
/*
 * Parse a base-10 number from @page with simple_strtoul() and store it in
 * *@var.  Always returns @count; the position where parsing stopped is not
 * needed, so no end pointer is requested.
 */
static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	unsigned long parsed = simple_strtoul(page, NULL, 10);

	*var = parsed;
	return count;
}
3720
3721 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
3722 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
3723 {                                                                       \
3724         struct cfq_data *cfqd = e->elevator_data;                       \
3725         unsigned int __data = __VAR;                                    \
3726         if (__CONV)                                                     \
3727                 __data = jiffies_to_msecs(__data);                      \
3728         return cfq_var_show(__data, (page));                            \
3729 }
3730 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3731 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3732 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3733 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3734 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3735 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3736 SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
3737 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3738 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3739 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3740 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3741 #undef SHOW_FUNCTION
3742
3743 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
3744 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
3745 {                                                                       \
3746         struct cfq_data *cfqd = e->elevator_data;                       \
3747         unsigned int __data;                                            \
3748         int ret = cfq_var_store(&__data, (page), count);                \
3749         if (__data < (MIN))                                             \
3750                 __data = (MIN);                                         \
3751         else if (__data > (MAX))                                        \
3752                 __data = (MAX);                                         \
3753         if (__CONV)                                                     \
3754                 *(__PTR) = msecs_to_jiffies(__data);                    \
3755         else                                                            \
3756                 *(__PTR) = __data;                                      \
3757         return ret;                                                     \
3758 }
3759 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
3760 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
3761                 UINT_MAX, 1);
3762 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
3763                 UINT_MAX, 1);
3764 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3765 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3766                 UINT_MAX, 0);
3767 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
3768 STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
3769 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3770 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3771 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
3772                 UINT_MAX, 0);
3773 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3774 #undef STORE_FUNCTION
3775
3776 #define CFQ_ATTR(name) \
3777         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
3778
3779 static struct elv_fs_entry cfq_attrs[] = {
3780         CFQ_ATTR(quantum),
3781         CFQ_ATTR(fifo_expire_sync),
3782         CFQ_ATTR(fifo_expire_async),
3783         CFQ_ATTR(back_seek_max),
3784         CFQ_ATTR(back_seek_penalty),
3785         CFQ_ATTR(slice_sync),
3786         CFQ_ATTR(slice_async),
3787         CFQ_ATTR(slice_async_rq),
3788         CFQ_ATTR(slice_idle),
3789         CFQ_ATTR(group_idle),
3790         CFQ_ATTR(low_latency),
3791         __ATTR_NULL
3792 };
3793
3794 static struct elevator_type iosched_cfq = {
3795         .ops = {
3796                 .elevator_merge_fn =            cfq_merge,
3797                 .elevator_merged_fn =           cfq_merged_request,
3798                 .elevator_merge_req_fn =        cfq_merged_requests,
3799                 .elevator_allow_merge_fn =      cfq_allow_merge,
3800                 .elevator_bio_merged_fn =       cfq_bio_merged,
3801                 .elevator_dispatch_fn =         cfq_dispatch_requests,
3802                 .elevator_add_req_fn =          cfq_insert_request,
3803                 .elevator_activate_req_fn =     cfq_activate_request,
3804                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
3805                 .elevator_completed_req_fn =    cfq_completed_request,
3806                 .elevator_former_req_fn =       elv_rb_former_request,
3807                 .elevator_latter_req_fn =       elv_rb_latter_request,
3808                 .elevator_init_icq_fn =         cfq_init_icq,
3809                 .elevator_exit_icq_fn =         cfq_exit_icq,
3810                 .elevator_set_req_fn =          cfq_set_request,
3811                 .elevator_put_req_fn =          cfq_put_request,
3812                 .elevator_may_queue_fn =        cfq_may_queue,
3813                 .elevator_init_fn =             cfq_init_queue,
3814                 .elevator_exit_fn =             cfq_exit_queue,
3815         },
3816         .icq_size       =       sizeof(struct cfq_io_cq),
3817         .icq_align      =       __alignof__(struct cfq_io_cq),
3818         .elevator_attrs =       cfq_attrs,
3819         .elevator_name  =       "cfq",
3820         .elevator_owner =       THIS_MODULE,
3821 };
3822
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/* blkio policy descriptor for cfq, registered with id BLKIO_POLICY_PROP. */
static struct blkio_policy_type blkio_policy_cfq = {
	.ops = {
		.blkio_alloc_group_fn		= cfq_alloc_blkio_group,
		.blkio_link_group_fn		= cfq_link_blkio_group,
		.blkio_unlink_group_fn		= cfq_unlink_blkio_group,
		.blkio_clear_queue_fn		= cfq_clear_queue,
		.blkio_update_group_weight_fn	= cfq_update_blkio_group_weight,
	},
	.plid = BLKIO_POLICY_PROP,
};
#endif
3835
3836 static int __init cfq_init(void)
3837 {
3838         int ret;
3839
3840         /*
3841          * could be 0 on HZ < 1000 setups
3842          */
3843         if (!cfq_slice_async)
3844                 cfq_slice_async = 1;
3845         if (!cfq_slice_idle)
3846                 cfq_slice_idle = 1;
3847
3848 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3849         if (!cfq_group_idle)
3850                 cfq_group_idle = 1;
3851 #else
3852                 cfq_group_idle = 0;
3853 #endif
3854         cfq_pool = KMEM_CACHE(cfq_queue, 0);
3855         if (!cfq_pool)
3856                 return -ENOMEM;
3857
3858         ret = elv_register(&iosched_cfq);
3859         if (ret) {
3860                 kmem_cache_destroy(cfq_pool);
3861                 return ret;
3862         }
3863
3864 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3865         blkio_policy_register(&blkio_policy_cfq);
3866 #endif
3867         return 0;
3868 }
3869
3870 static void __exit cfq_exit(void)
3871 {
3872 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3873         blkio_policy_unregister(&blkio_policy_cfq);
3874 #endif
3875         elv_unregister(&iosched_cfq);
3876         kmem_cache_destroy(cfq_pool);
3877 }
3878
3879 module_init(cfq_init);
3880 module_exit(cfq_exit);
3881
3882 MODULE_AUTHOR("Jens Axboe");
3883 MODULE_LICENSE("GPL");
3884 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");