Documentation/cgroups/blkio-controller.txt | 100 +++ block/Kconfig | 22 + block/Kconfig.iosched | 17 + block/Makefile | 1 + block/blk-cgroup.c | 312 ++++++++++ block/blk-cgroup.h | 90 +++ block/cfq-iosched.c | 901 ++++++++++++++++++++++++---- include/linux/cgroup_subsys.h | 6 + include/linux/iocontext.h | 4 + 9 files changed, 1346 insertions(+), 107 deletions(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt new file mode 100644 index 0000000..6fb772f --- /dev/null +++ b/Documentation/cgroups/blkio-controller.txt @@ -0,0 +1,100 @@ + Block IO Controller + =================== +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kind of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in storage hierarchy. Plan +is to use same cgroup based management interface for blkio controller and +based on user options switch IO policies in the background. + +In the first phase, this patchset implements proportional weight time based +division of disk policy. It is implemented in CFQ. Hence this policy takes +effect only on leaf nodes when CFQ is being used. + +HOWTO +===== +You can do a very simple testing of running two dd threads in two different +cgroups. Here is what you can do. + +- Enable group scheduling in CFQ + CONFIG_CFQ_GROUP_IOSCHED=y + +- Compile and boot into kernel and mount IO controller (blkio). + + mount -t cgroup -o blkio none /cgroup + +- Create two cgroups + mkdir -p /cgroup/test1/ /cgroup/test2 + +- Set weights of group test1 and test2 + echo 1000 > /cgroup/test1/blkio.weight + echo 500 > /cgroup/test2/blkio.weight + +- Create two same size files (say 512MB each) on same disk (file1, file2) and + launch two dd threads in different cgroup to read those files. + + sync + echo 3 > /proc/sys/vm/drop_caches + + dd if=/mnt/sdb/zerofile1 of=/dev/null & + echo $! > /cgroup/test1/tasks + cat /cgroup/test1/tasks + + dd if=/mnt/sdb/zerofile2 of=/dev/null & + echo $! > /cgroup/test2/tasks + cat /cgroup/test2/tasks + +- At macro level, first dd should finish first. To get more precise data, keep + on looking at (with the help of script), at blkio.disk_time and + blkio.disk_sectors files of both test1 and test2 groups. This will tell how + much disk time (in milli seconds), each group got and how many secotors each + group dispatched to the disk. We provide fairness in terms of disk time, so + ideally io.disk_time of cgroups should be in proportion to the weight. + +Various user visible config options +=================================== +CONFIG_CFQ_GROUP_IOSCHED + - Enables group scheduling in CFQ. Currently only 1 level of group + creation is allowed. + +CONFIG_DEBUG_CFQ_IOSCHED + - Enables some debugging messages in blktrace. Also creates extra + cgroup file blkio.dequeue. + +Config options selected automatically +===================================== +These config options are not user visible and are selected/deselected +automatically based on IO scheduler configuration. + +CONFIG_BLK_CGROUP + - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. + +CONFIG_DEBUG_BLK_CGROUP + - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. + +Details of cgroup files +======================= +- blkio.weight + - Specifies per cgroup weight. + + Currently allowed range of weights is from 100 to 1000. + +- blkio.time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- blkio.sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- blkio.dequeue + - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This + gives the statistics about how many a times a group was dequeued + from service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was dequeued from a particular device. diff --git a/block/Kconfig b/block/Kconfig index 9be0b56..e20fbde 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_CGROUP + bool + depends on CGROUPS + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. + +config DEBUG_BLK_CGROUP + bool + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it stores the cgroup path + in the blk group which can be used by cfq for tracing various + group related activity. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 8bd1051..b71abfb 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -32,6 +32,23 @@ config IOSCHED_CFQ This is the default I/O scheduler. +config CFQ_GROUP_IOSCHED + bool "CFQ Group Scheduling support" + depends on IOSCHED_CFQ && CGROUPS + select BLK_CGROUP + default n + ---help--- + Enable group IO scheduling in CFQ. + +config DEBUG_CFQ_IOSCHED + bool "Debug CFQ Scheduling" + depends on CFQ_GROUP_IOSCHED + select DEBUG_BLK_CGROUP + default n + ---help--- + Enable CFQ IO scheduling debugging in CFQ. Currently it makes + blktrace output more verbose. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/Makefile b/block/Makefile index 7914108..cb2d515 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c new file mode 100644 index 0000000..179ddfa --- /dev/null +++ b/block/blk-cgroup.c @@ -0,0 +1,312 @@ +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ +#include +#include +#include +#include "blk-cgroup.h" + +extern void cfq_unlink_blkio_group(void *, struct blkio_group *); +extern void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); + +struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; + +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkio_cgroup, css); +} + +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ + blkg->time += time; + blkg->sectors += sectors; +} + +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) +{ + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + rcu_assign_pointer(blkg->key, key); + blkg->blkcg_id = css_id(&blkcg->css); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + spin_unlock_irqrestore(&blkcg->lock, flags); +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Need to take css reference ? */ + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); +#endif + blkg->dev = dev; +} + +static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + hlist_del_init_rcu(&blkg->blkcg_node); + blkg->blkcg_id = 0; +} + +/* + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 + * indicating that blk_group was unhashed by the time we got to it. + */ +int blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + struct blkio_cgroup *blkcg; + unsigned long flags; + struct cgroup_subsys_state *css; + int ret = 1; + + rcu_read_lock(); + css = css_lookup(&blkio_subsys, blkg->blkcg_id); + if (!css) + goto out; + + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); +out: + rcu_read_unlock(); + return ret; +} + +/* called under rcu_read_lock(). */ +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +{ + struct blkio_group *blkg; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + __key = blkg->key; + if (__key == key) + return blkg; + } + + return NULL; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct blkio_cgroup *blkcg; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + return (u64)blkcg->__VAR; \ +} + +SHOW_FUNCTION(weight); +#undef SHOW_FUNCTION + +static int +blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct hlist_node *n; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + cfq_update_blkio_group_weight(blkg, blkcg->weight); + spin_unlock_irq(&blkcg->lock); + return 0; +} + +#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype, struct seq_file *m) \ +{ \ + struct blkio_cgroup *blkcg; \ + struct blkio_group *blkg; \ + struct hlist_node *n; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + rcu_read_lock(); \ + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ + if (blkg->dev) \ + seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ + MINOR(blkg->dev), blkg->__VAR); \ + } \ + rcu_read_unlock(); \ + cgroup_unlock(); \ + return 0; \ +} + +SHOW_FUNCTION_PER_GROUP(time); +SHOW_FUNCTION_PER_GROUP(sectors); +#ifdef CONFIG_DEBUG_BLK_CGROUP +SHOW_FUNCTION_PER_GROUP(dequeue); +#endif +#undef SHOW_FUNCTION_PER_GROUP + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->dequeue += dequeue; +} +#endif + +struct cftype blkio_files[] = { + { + .name = "weight", + .read_u64 = blkiocg_weight_read, + .write_u64 = blkiocg_weight_write, + }, + { + .name = "time", + .read_seq_string = blkiocg_time_read, + }, + { + .name = "sectors", + .read_seq_string = blkiocg_sectors_read, + }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "dequeue", + .read_seq_string = blkiocg_dequeue_read, + }, +#endif +}; + +static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, blkio_files, + ARRAY_SIZE(blkio_files)); +} + +static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + unsigned long flags; + struct blkio_group *blkg; + void *key; + + rcu_read_lock(); +remove_entry: + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + goto done; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); + + spin_unlock_irqrestore(&blkcg->lock, flags); + + /* + * This blkio_group is being unlinked as associated cgroup is going + * away. Let all the IO controlling policies know about this event. + * + * Currently this is static call to one io controlling policy. Once + * we have more policies in place, we need some dynamic registration + * of callback function. + */ + cfq_unlink_blkio_group(key, blkg); + goto remove_entry; +done: + free_css_id(&blkio_subsys, &blkcg->css); + rcu_read_unlock(); + kfree(blkcg); +} + +static struct cgroup_subsys_state * +blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg, *parent_blkcg; + + if (!cgroup->parent) { + blkcg = &blkio_root_cgroup; + goto done; + } + + /* Currently we do not support hierarchy deeper than two level (0,1) */ + parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); + if (css_depth(&parent_blkcg->css) > 0) + return ERR_PTR(-EINVAL); + + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); + + blkcg->weight = BLKIO_WEIGHT_DEFAULT; +done: + spin_lock_init(&blkcg->lock); + INIT_HLIST_HEAD(&blkcg->blkg_list); + + return &blkcg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. + */ +static int blkiocg_can_attach(struct cgroup_subsys *subsys, + struct cgroup *cgroup, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, + .subsys_id = blkio_subsys_id, + .use_id = 1, +}; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 0000000..b24ab71 --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,90 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include + +struct blkio_cgroup { + struct cgroup_subsys_state css; + unsigned int weight; + spinlock_t lock; + struct hlist_head blkg_list; +}; + +struct blkio_group { + /* An rcu protected unique identifier for the group */ + void *key; + struct hlist_node blkcg_node; + unsigned short blkcg_id; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Store cgroup path */ + char path[128]; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; +#endif + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + + /* total disk time and nr sectors dispatched by this group */ + unsigned long time; + unsigned long sectors; +}; + +#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_DEBUG_BLK_CGROUP +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue); +#else +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +static inline void blkiocg_update_blkio_group_dequeue_stats( + struct blkio_group *blkg, unsigned long dequeue) {} +#endif + +#ifdef CONFIG_BLK_CGROUP +extern struct blkio_cgroup blkio_root_cgroup; +extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev); +extern int blkiocg_del_blkio_group(struct blkio_group *blkg); +extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, + void *key); +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors); +#else +static inline struct blkio_cgroup * +cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } + +static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) +{ +} + +static inline int +blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } + +static inline struct blkio_group * +blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ +} +#endif +#endif /* _BLK_CGROUP_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1bcbd8c..4e42b2e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -13,6 +13,7 @@ #include #include #include +#include "blk-cgroup.h" /* * tunables @@ -49,6 +50,7 @@ static const int cfq_hist_divisor = 4; #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) +#define CFQ_SERVICE_SHIFT 12 #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) @@ -77,8 +79,10 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; + u64 min_vdisktime; + struct rb_node *active; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } /* * Per process-grouping structure @@ -109,6 +113,10 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; + /* time when queue got scheduled in to dispatch first request. */ + unsigned long dispatch_start; + /* time when first request from queue completed and slice started. */ + unsigned long slice_start; unsigned long slice_end; long slice_resid; unsigned int slice_dispatch; @@ -132,6 +140,9 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; + struct cfq_group *cfqg; + /* Sectors dispatched in current dispatch round */ + unsigned long nr_sectors; }; /* @@ -153,25 +164,57 @@ enum wl_type_t { SYNC_WORKLOAD = 2 }; +/* This is per cgroup per device grouping structure */ +struct cfq_group { + /* group service_tree member */ + struct rb_node rb_node; -/* - * Per block device queue structure - */ -struct cfq_data { - struct request_queue *queue; + /* group service_tree key */ + u64 vdisktime; + unsigned int weight; + bool on_st; + /* number of cfqq currently on this group */ + int nr_cfqq; + + /* Per group busy queus average. Useful for workload slice calc. */ + unsigned int busy_queues_avg[2]; /* * rr lists of queues with requests, onle rr for each priority class. * Counts are embedded in the cfq_rb_root */ struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; + + unsigned long saved_workload_slice; + enum wl_type_t saved_workload; + enum wl_prio_t saved_serving_prio; + struct blkio_group blkg; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct hlist_node cfqd_node; + atomic_t ref; +#endif +}; + +/* + * Per block device queue structure + */ +struct cfq_data { + struct request_queue *queue; + + /* Root service tree for cfq_groups */ + struct cfq_rb_root grp_service_tree; + struct cfq_group root_group; + /* Number of active cfq groups on group service tree */ + int nr_groups; + /* * The priority currently being served */ enum wl_prio_t serving_prio; enum wl_type_t serving_type; unsigned long workload_expires; + struct cfq_group *serving_group; /* * Each priority tree is sorted by next_request position. These @@ -181,7 +224,6 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - unsigned int busy_queues_avg[2]; int rq_in_driver[2]; int sync_flight; @@ -222,6 +264,7 @@ struct cfq_data { unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; unsigned int cfq_latency; + unsigned int cfq_group_idle; struct list_head cic_list; @@ -231,16 +274,25 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_end_sync_rq; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; }; -static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio, +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); + +static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, + enum wl_prio_t prio, enum wl_type_t type, struct cfq_data *cfqd) { + if (!cfqg) + return NULL; + if (prio == IDLE_WORKLOAD) - return &cfqd->service_tree_idle; + return &cfqg->service_tree_idle; - return &cfqd->service_trees[prio][type]; + return &cfqg->service_trees[prio][type]; } enum cfqq_state_flags { @@ -282,8 +334,21 @@ CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); #undef CFQ_CFQQ_FNS +#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&(cfqq)->cfqg->blkg), ##args); + +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args); \ + +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do{}while(0); +#endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -306,14 +371,16 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) return SYNC_WORKLOAD; } -static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) +static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + struct cfq_data *cfqd, + struct cfq_group *cfqg) { if (wl == IDLE_WORKLOAD) - return cfqd->service_tree_idle.count; + return cfqg->service_tree_idle.count; - return cfqd->service_trees[wl][ASYNC_WORKLOAD].count - + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count - + cfqd->service_trees[wl][SYNC_WORKLOAD].count; + return cfqg->service_trees[wl][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); @@ -364,7 +431,7 @@ static int cfq_queue_empty(struct request_queue *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - return !cfqd->busy_queues; + return !cfqd->rq_queued; } /* @@ -388,24 +455,70 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +{ + u64 d = delta << CFQ_SERVICE_SHIFT; + + d = d * BLKIO_WEIGHT_DEFAULT; + do_div(d, cfqg->weight); + return d; +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct cfq_rb_root *st) +{ + u64 vdisktime = st->min_vdisktime; + struct cfq_group *cfqg; + + if (st->active) { + cfqg = rb_entry(st->active, struct cfq_group, rb_node); + vdisktime = cfqg->vdisktime; + } + + if (st->left) { + cfqg = rb_entry(st->left, struct cfq_group, rb_node); + vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + /* * get averaged number of queues of RT/BE priority. * average is updated, with a formula that gives more weight to higher numbers, * to quickly follows sudden increases and decrease slowly */ -static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) +static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg, bool rt) { unsigned min_q, max_q; unsigned mult = cfq_hist_divisor - 1; unsigned round = cfq_hist_divisor / 2; - unsigned busy = cfq_busy_queues_wl(rt, cfqd); + unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); - min_q = min(cfqd->busy_queues_avg[rt], busy); - max_q = max(cfqd->busy_queues_avg[rt], busy); - cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + min_q = min(cfqg->busy_queues_avg[rt], busy); + max_q = max(cfqg->busy_queues_avg[rt], busy); + cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / cfq_hist_divisor; - return cfqd->busy_queues_avg[rt]; + return cfqg->busy_queues_avg[rt]; } static inline void @@ -415,10 +528,13 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (cfqd->cfq_latency) { /* interested queues (we consider only the ones with the same * priority class) */ - unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq)); + unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, + cfq_class_rt(cfqq)); unsigned sync_slice = cfqd->cfq_slice[1]; unsigned expect_latency = sync_slice * iq; - if (expect_latency > cfq_target_latency) { + unsigned group_target_lat = cfq_target_latency/cfqd->nr_groups; + + if (expect_latency > group_target_lat) { unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; /* scale low_slice according to IO priority * and sync vs async */ @@ -426,10 +542,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) min(slice, base_low_slice * slice / sync_slice); /* the adapted slice value is scaled to fit all iqs * into the target latency */ - slice = max(slice * cfq_target_latency / expect_latency, + slice = max(slice * group_target_lat / expect_latency, low_slice); } } + cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -556,6 +673,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) return NULL; } +static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) +{ + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry(root->left, struct cfq_group, rb_node); + + return NULL; +} + static void rb_erase_init(struct rb_node *n, struct rb_root *root) { rb_erase(n, root); @@ -607,6 +735,381 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } +static inline s64 +cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + return cfqg->vdisktime - st->min_vdisktime; +} + +static void +__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct cfq_group *__cfqg; + s64 key = cfqg_key(st, cfqg); + int left = 1; + + while (*node != NULL) { + parent = *node; + __cfqg = rb_entry(parent, struct cfq_group, rb_node); + + if (key < cfqg_key(st, __cfqg)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &cfqg->rb_node; + + rb_link_node(&cfqg->rb_node, parent, node); + rb_insert_color(&cfqg->rb_node, &st->rb); +} + +static void +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *__cfqg; + struct rb_node *n; + + cfqg->nr_cfqq++; + if (cfqg->on_st) + return; + + /* + * Currently put the group at the end. Later implement something + * so that groups get lesser vtime based on their weights, so that + * if group does not loose all if it was not continously backlogged. + */ + n = rb_last(&st->rb); + if (n) { + __cfqg = rb_entry(n, struct cfq_group, rb_node); + cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; + } else + cfqg->vdisktime = st->min_vdisktime; + + __cfq_group_service_tree_add(st, cfqg); + cfqg->on_st = true; + cfqd->nr_groups++; +} + +static void +cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + if (st->active == &cfqg->rb_node) + st->active = NULL; + + BUG_ON(cfqg->nr_cfqq < 1); + cfqg->nr_cfqq--; + + /* If there are other cfq queues under this group, don't delete it */ + if (cfqg->nr_cfqq) + return; + + cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); + cfqg->on_st = false; + cfqd->nr_groups--; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->saved_workload_slice = 0; + blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); +} + +static inline unsigned long cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +{ + unsigned long slice_used, allocated_slice; + + /* + * Queue got expired before even a single request completed or + * got expired immediately after first request completion. + */ + if (!cfqq->slice_end || cfqq->slice_start == jiffies) { + /* + * Also charge the seek time incurred to the group, otherwise + * if there are mutiple queues in the group, each can dispatch + * a single request on seeky media and cause lots of seek time + * and group will never know it. + */ + slice_used = max_t(unsigned long, + (jiffies - cfqq->dispatch_start), 1); + } else { + slice_used = jiffies - cfqq->slice_start; + allocated_slice = cfqq->slice_end - cfqq->slice_start; + if (slice_used > allocated_slice) + slice_used = allocated_slice; + } + + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%lu sect=%lu", slice_used, + cfqq->nr_sectors); + return slice_used; +} + +static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, + unsigned long service, unsigned long sectors) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + /* Can't update vdisktime while group is on service tree */ + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->vdisktime += cfq_scale_slice(service, cfqg); + __cfq_group_service_tree_add(st, cfqg); + + /* This group is being expired. Save the context */ + if (time_after(cfqd->workload_expires, jiffies)) { + cfqg->saved_workload_slice = cfqd->workload_expires + - jiffies; + cfqg->saved_workload = cfqd->serving_type; + cfqg->saved_serving_prio = cfqd->serving_prio; + } else + cfqg->saved_workload_slice = 0; + + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, + st->min_vdisktime); + blkiocg_update_blkio_group_stats(&cfqg->blkg, service, sectors); +} + +/* + * Determine whether we should enforce idle window for this queue. + */ + +static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + enum wl_prio_t prio = cfqq_prio(cfqq); + struct cfq_rb_root *service_tree = cfqq->service_tree; + + /* We never do for idle class queues. */ + if (prio == IDLE_WORKLOAD) + return false; + + /* We do for queues that were marked with idle window flag. */ + if (cfq_cfqq_idle_window(cfqq)) + return true; + + /* + * Otherwise, we do only if they are the last ones + * in their service tree. + */ + if (!service_tree) + service_tree = service_tree_for(cfqq->cfqg, prio, + cfqq_type(cfqq), cfqd); + + if (service_tree->count == 0) + return true; + + return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq); +} + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +void +cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + cfqg_of_blkg(blkg)->weight = weight; +} + +static struct cfq_group * +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct cfq_group *cfqg = NULL; + void *key = cfqd; + int i, j; + unsigned int major, minor; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + + /* Do we need to take this reference */ + if (!css_tryget(&blkcg->css)) + return NULL;; + + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg || !create) + goto done; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); + if (!cfqg) + goto done; + + cfqg->weight = blkcg->weight; + + for (i = 0; i < 2; ++i) + for (j = 0; j < 3; ++j) + cfqg->service_trees[i][j] = CFQ_RB_ROOT; + cfqg->service_tree_idle = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&cfqg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + MKDEV(major, minor)); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + +done: + css_put(&blkcg->css); + return cfqg; +} + +/* + * Search for the cfq group current task belongs to. If create = 1, then also + * create the cfq group if it does not exist. request_queue lock must be held. + */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + struct cgroup *cgroup; + struct cfq_group *cfqg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + if (!cfqg && create) + cfqg = &cfqd->root_group; + rcu_read_unlock(); + return cfqg; +} + +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + /* Currently, all async queues are mapped to root group */ + if (!cfq_cfqq_sync(cfqq)) + cfqg = &cfqq->cfqd->root_group; + + cfqq->cfqg = cfqg; + /* cfqq reference on cfqg */ + atomic_inc(&cfqq->cfqg->ref); +} + +static void cfq_put_cfqg(struct cfq_group *cfqg) +{ + struct cfq_rb_root *st; + int i, j; + + BUG_ON(atomic_read(&cfqg->ref) <= 0); + if (!atomic_dec_and_test(&cfqg->ref)) + return; + + for (i = 0; i < 2; ++i) + for (j = 0; j < 3; ++j) { + st = &cfqg->service_trees[i][j]; + BUG_ON(!RB_EMPTY_ROOT(&st->rb)); + BUG_ON(st->active != NULL); + } + + BUG_ON(!RB_EMPTY_ROOT(&cfqg->service_tree_idle.rb)); + BUG_ON(cfqg->service_tree_idle.active != NULL); + + kfree(cfqg); +} + +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + + hlist_del_init(&cfqg->cfqd_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + cfq_put_cfqg(cfqg); +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) +{ + struct hlist_node *pos, *n; + struct cfq_group *cfqg; + + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); + } +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu + * read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deltion + * path got to it first. + */ +void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct cfq_data *cfqd = key; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +static inline bool cfqq_should_wait_busy(struct cfq_queue *cfqq) +{ + /* Group idling is disabled */ + if (!cfqq->cfqd->cfq_group_idle) + return false; + + /* cfqq group still has got more requests to dispatch */ + if (!RB_EMPTY_ROOT(&cfqq->sort_list) || cfqq->cfqg->nr_cfqq > 1) + return false; + + if (!cfq_should_idle(cfqq->cfqd, cfqq)) + return false; + + return true; +} + +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} +static inline void +cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { + cfqq->cfqg = cfqg; +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} +static inline bool cfqq_should_wait_busy(struct cfq_queue *cfqq) +{ + return false; +} + +#endif /* GROUP_IOSCHED */ + /* * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. It is sorted in the order that @@ -620,8 +1123,10 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, unsigned long rb_key; struct cfq_rb_root *service_tree; int left; + int new_cfqq = 1; - service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); + service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), + cfqq_type(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -647,6 +1152,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, } if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + new_cfqq = 0; /* * same position, nothing more to do */ @@ -688,6 +1194,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; + if (add_front || !new_cfqq) + return; + cfq_group_service_tree_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -798,6 +1307,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; } @@ -808,7 +1319,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -816,8 +1326,17 @@ static void cfq_del_rq_rb(struct request *rq) elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { + /* + * Queue will be deleted from service tree when we actually + * expire it later. Right now just remove it from prio tree + * as it is empty. + */ + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -971,6 +1490,9 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct cfq_io_context *cic; struct cfq_queue *cfqq; + /* Deny merge if bio and rq don't belong to same cfq group */ + if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) + return false; /* * Disallow merge of a sync bio into an async request. */ @@ -994,8 +1516,11 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -1031,11 +1556,20 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + cfq_group_served(cfqd, cfqq->cfqg, cfq_cfqq_slice_usage(cfqq), + cfqq->nr_sectors); + + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_del_cfqq_rr(cfqd, cfqq); + cfq_resort_rr_list(cfqd, cfqq); if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; + if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) + cfqd->grp_service_tree.active = NULL; + if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -1054,16 +1588,50 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) * Get next queue for service. Unless we have a queue preemption, * we'll simply select the first cfqq in the service tree. */ -static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) +static struct cfq_queue *__cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_rb_root *service_tree = - service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd); + service_tree_for(cfqd->serving_group, cfqd->serving_prio, + cfqd->serving_type, cfqd); + if (!cfqd->rq_queued) + return NULL; + + /* There is nothing to dispatch */ + if (!service_tree) + return NULL; if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; return cfq_rb_first(service_tree); } +static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg; + struct cfq_queue *cfqq; + int i, j; + + if (!cfqd->rq_queued) + return NULL; + + cfqg = cfq_get_next_cfqg(cfqd); + if (!cfqg) + return NULL; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 3; ++j) { + cfqq = cfq_rb_first(&cfqg->service_trees[i][j]); + if (cfqq) + return cfqq; + } + } + + cfqq = cfq_rb_first(&cfqg->service_tree_idle); + if (cfqq) + return cfqq; + return NULL; +} + /* * Get and set a new active queue for service. */ @@ -1071,7 +1639,7 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (!cfqq) - cfqq = cfq_get_next_queue(cfqd); + cfqq = __cfq_get_next_queue(cfqd); __cfq_set_active_queue(cfqd, cfqq); return cfqq; @@ -1170,6 +1738,10 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfqq) return NULL; + /* If new queue belongs to different cfq_group, don't choose it */ + if (cur_cfqq->cfqg != cfqq->cfqg) + return NULL; + /* * It only makes sense to merge sync queues. */ @@ -1187,41 +1759,16 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, return cfqq; } -/* - * Determine whether we should enforce idle window for this queue. - */ - -static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - enum wl_prio_t prio = cfqq_prio(cfqq); - struct cfq_rb_root *service_tree = cfqq->service_tree; - - /* We never do for idle class queues. */ - if (prio == IDLE_WORKLOAD) - return false; - - /* We do for queues that were marked with idle window flag. */ - if (cfq_cfqq_idle_window(cfqq)) - return true; - - /* - * Otherwise, we do only if they are the last ones - * in their service tree. - */ - if (!service_tree) - service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd); - - if (service_tree->count == 0) - return true; - - return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq); -} - -static void cfq_arm_slice_timer(struct cfq_data *cfqd) +static bool cfq_arm_slice_timer(struct cfq_data *cfqd, int wait_busy) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; unsigned long sl; + struct cfq_rb_root *st; + + /* If idle timer is already armed, nothing to do */ + if (wait_busy && timer_pending(&cfqd->idle_slice_timer)) + return true; /* * SSD device without seek penalty, disable idling. But only do so @@ -1229,7 +1776,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * with sync vs async workloads. */ if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) - return; + return false; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); WARN_ON(cfq_cfqq_slice_new(cfqq)); @@ -1238,29 +1785,29 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * idle is disabled, either manually or by past process history */ if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) - return; + return false; /* * still requests with the driver, don't idle */ - if (rq_in_driver(cfqd)) - return; + if (rq_in_driver(cfqd) && !wait_busy) + return false; /* * task has exited, don't wait */ cic = cfqd->active_cic; if (!cic || !atomic_read(&cic->ioc->nr_tasks)) - return; + return false; /* * If our average think time is larger than the remaining time * slice, then don't idle. This avoids overrunning the allotted * time slice. */ - if (sample_valid(cic->ttime_samples) && + if (!wait_busy && sample_valid(cic->ttime_samples) && (cfqq->slice_end - jiffies < cic->ttime_mean)) - return; + return false; cfq_mark_cfqq_wait_request(cfqq); @@ -1271,16 +1818,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * fair distribution of slice time for a process doing back-to-back * seeks. */ - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && - service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WORKLOAD, cfqd) - ->count > 0) { + st = service_tree_for(cfqq->cfqg, cfqd->serving_prio, + SYNC_NOIDLE_WORKLOAD, cfqd); + if (!wait_busy && cfqd->serving_type == SYNC_NOIDLE_WORKLOAD + && st->count > 0) { if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag) - return; + return false; sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); } + if (wait_busy) + sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); + mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu wait_busy=%d", sl, wait_busy); + return true; } /* @@ -1300,6 +1852,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight++; + cfqq->nr_sectors += blk_rq_sectors(rq); } /* @@ -1381,8 +1934,9 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } } -static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, - bool prio_changed) +static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, + struct cfq_group *cfqg, enum wl_prio_t prio, + bool prio_changed) { struct cfq_queue *queue; int i; @@ -1396,10 +1950,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, * from SYNC_NOIDLE (first choice), or just SYNC * over ASYNC */ - if (service_tree_for(prio, cur_best, cfqd)->count) + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) return cur_best; cur_best = SYNC_WORKLOAD; - if (service_tree_for(prio, cur_best, cfqd)->count) + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) return cur_best; return ASYNC_WORKLOAD; @@ -1407,7 +1961,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, for (i = 0; i < 3; ++i) { /* otherwise, select the one with lowest rb_key */ - queue = cfq_rb_first(service_tree_for(prio, i, cfqd)); + queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); if (queue && (!key_valid || time_before(queue->rb_key, lowest_key))) { lowest_key = queue->rb_key; @@ -1419,17 +1973,25 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, return cur_best; } -static void choose_service_tree(struct cfq_data *cfqd) +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) { enum wl_prio_t previous_prio = cfqd->serving_prio; bool prio_changed; unsigned slice; unsigned count; + struct cfq_rb_root *st; + int group_target_latency; + + if (!cfqg) { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } /* Choose next priority. RT > BE > IDLE */ - if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) cfqd->serving_prio = RT_WORKLOAD; - else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) cfqd->serving_prio = BE_WORKLOAD; else { cfqd->serving_prio = IDLE_WORKLOAD; @@ -1443,8 +2005,9 @@ static void choose_service_tree(struct cfq_data *cfqd) * expiration time */ prio_changed = (cfqd->serving_prio != previous_prio); - count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) - ->count; + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, + cfqd); + count = st->count; /* * If priority didn't change, check workload expiration, @@ -1456,18 +2019,21 @@ static void choose_service_tree(struct cfq_data *cfqd) /* otherwise select new workload type */ cfqd->serving_type = - cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed); - count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) - ->count; + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, + cfqd); + count = st->count; /* * the workload slice is computed as a fraction of target latency * proportional to the number of queues in that workload, over * all the queues in the same priority class */ - slice = cfq_target_latency * count / - max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio], - cfq_busy_queues_wl(cfqd->serving_prio, cfqd)); + group_target_latency = cfq_target_latency/cfqd->nr_groups; + + slice = group_target_latency * count / + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], + cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); if (cfqd->serving_type == ASYNC_WORKLOAD) /* async workload slice is scaled down according to @@ -1481,6 +2047,34 @@ static void choose_service_tree(struct cfq_data *cfqd) cfqd->workload_expires = jiffies + slice; } +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *cfqg; + + if (RB_EMPTY_ROOT(&st->rb)) + return NULL; + cfqg = cfq_rb_first_group(st); + st->active = &cfqg->rb_node; + update_min_vdisktime(st); + return cfqg; +} + +static void cfq_choose_cfqg(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); + + cfqd->serving_group = cfqg; + + /* Restore the workload type data */ + if (cfqg->saved_workload_slice) { + cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; + cfqd->serving_type = cfqg->saved_workload; + cfqd->serving_prio = cfqg->saved_serving_prio; + } + choose_service_tree(cfqd, cfqg); +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. @@ -1493,6 +2087,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) if (!cfqq) goto new_queue; + if (!cfqd->rq_queued) + return NULL; + /* * The active queue has run out of time, expire it and select new. */ @@ -1531,6 +2128,16 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) } expire: + /* + * Wait for a group to get busy before we expire it. No wait + * is done for NCQ SSDs. Do a small wait of 2ms on rotational + * media in the hope that group will get backlogged again and + * not loose its fair share. + */ + if (cfqq_should_wait_busy(cfqq) && cfq_arm_slice_timer(cfqd, 1)) { + cfqq = NULL; + goto keep_queue; + } cfq_slice_expired(cfqd, 0); new_queue: /* @@ -1538,7 +2145,7 @@ new_queue: * service tree */ if (!new_cfqq) - choose_service_tree(cfqd); + cfq_choose_cfqg(cfqd); cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: @@ -1555,6 +2162,10 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) } BUG_ON(!list_empty(&cfqq->fifo)); + + /* By default cfqq is not expired if it is empty. Do it explicitly */ + __cfq_slice_expired(cfqq->cfqd, cfqq, 0); + return dispatched; } @@ -1566,14 +2177,8 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) { struct cfq_queue *cfqq; int dispatched = 0; - int i, j; - for (i = 0; i < 2; ++i) - for (j = 0; j < 3; ++j) - while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j])) - != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL) + while ((cfqq = cfq_get_next_queue(cfqd)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); @@ -1730,11 +2335,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) * task holds one reference to the queue, dropped when task exits. each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * + * Each cfq queue took a reference on the parent group. Drop it now. * queue lock must be held here. */ static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; + struct cfq_group *cfqg; BUG_ON(atomic_read(&cfqq->ref) <= 0); @@ -1744,14 +2351,17 @@ static void cfq_put_queue(struct cfq_queue *cfqq) cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); + cfqg = cfqq->cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + BUG_ON(cfq_cfqq_on_rr(cfqq)); + kmem_cache_free(cfq_pool, cfqq); + cfq_put_cfqg(cfqg); } /* @@ -2044,14 +2654,51 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + if (sync_cfqq) { + /* + * Drop reference to sync queue. A new sync queue will be + * assigned in new group upon arrival of a fresh request. + */ + cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct cfq_group *cfqg; retry: + cfqg = cfq_get_cfqg(cfqd, 1); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -2082,6 +2729,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else cfqq = &cfqd->oom_cfqq; @@ -2273,6 +2921,10 @@ out: if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); @@ -2386,6 +3038,9 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (!cfqq) return false; + if (new_cfqq->cfqg != cfqq->cfqg) + return false; + if (cfq_slice_used(cfqq)) return true; @@ -2488,9 +3143,9 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); - __blk_run_queue(cfqd->queue); - } - cfq_mark_cfqq_must_dispatch(cfqq); + __blk_run_queue(cfqd->queue); + } else + cfq_mark_cfqq_must_dispatch(cfqq); } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* @@ -2600,10 +3255,13 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * of idling. */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1); + if (!cfqq_should_wait_busy(cfqq)) + cfq_slice_expired(cfqd, 0); + else + cfq_arm_slice_timer(cfqd, 1); else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) && sync && !rq_noidle(rq)) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd, 0); } if (!rq_in_driver(cfqd)) @@ -2902,11 +3560,15 @@ static void cfq_exit_queue(struct elevator_queue *e) } cfq_put_async_queues(cfqd); + cfq_release_cfq_groups(cfqd); + blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ + synchronize_rcu(); kfree(cfqd); } @@ -2914,16 +3576,36 @@ static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; int i, j; + struct cfq_group *cfqg; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + /* Init root service tree */ + cfqd->grp_service_tree = CFQ_RB_ROOT; + + /* Init root group */ + cfqg = &cfqd->root_group; + for (i = 0; i < 2; ++i) for (j = 0; j < 3; ++j) - cfqd->service_trees[i][j] = CFQ_RB_ROOT; - cfqd->service_tree_idle = CFQ_RB_ROOT; + cfqg->service_trees[i][j] = CFQ_RB_ROOT; + cfqg->service_tree_idle = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* Give preference to root group over other groups */ + cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* + * Take a reference to root group which we never drop. This is just + * to make sure that cfq_put_cfqg() does not try to kfree root group + */ + atomic_set(&cfqg->ref, 1); + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, + 0); +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -2939,6 +3621,7 @@ static void *cfq_init_queue(struct request_queue *q) */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); atomic_inc(&cfqd->oom_cfqq.ref); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); @@ -2960,6 +3643,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_latency = 1; + cfqd->cfq_group_idle = 1; cfqd->hw_tag = 1; cfqd->last_end_sync_rq = jiffies; return cfqd; @@ -3030,6 +3714,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); +SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -3062,6 +3747,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); +STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -3078,6 +3764,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), CFQ_ATTR(low_latency), + CFQ_ATTR(group_idle), __ATTR_NULL }; diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31b..ccefff0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -60,3 +60,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_BLK_CGROUP +SUBSYS(blkio) +#endif + +/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index eb73632..d61b0b8 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -68,6 +68,10 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_BLK_CGROUP + unsigned short cgroup_changed; +#endif + /* * For request batching */