 Documentation/cgroups/blkio-controller.txt |  106 +++
 block/Kconfig                              |   22 +
 block/Kconfig.iosched                      |   17 +
 block/Makefile                             |    1 +
 block/blk-cgroup.c                         |  343 ++++
 block/blk-cgroup.h                         |   67 ++
 block/cfq-iosched.c                        | 1187 ++++++++++++++++++++++-----
 include/linux/cgroup_subsys.h              |    6 +
 include/linux/iocontext.h                  |    4 +
 9 files changed, 1533 insertions(+), 220 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
new file mode 100644
index 0000000..dc8fb1a
--- /dev/null
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -0,0 +1,106 @@
+				Block IO Controller
+				===================
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need for various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in the storage hierarchy.
+The plan is to use the same cgroup based management interface for the blkio
+controller and to switch IO policies in the background based on user options.
+
+In the first phase, this patchset implements a proportional weight, time based
+division of disk policy. It is implemented in CFQ. Hence this policy takes
+effect only on leaf nodes when CFQ is being used.
+
+HOWTO
+=====
+A very simple test is to run two dd threads in two different cgroups. Here
+is what you can do.
+
+- Enable group scheduling in CFQ
+	CONFIG_CFQ_GROUP_IOSCHED=y
+
+- Compile and boot into the kernel and mount the IO controller (blkio).
+
+	mount -t cgroup -o blkio none /cgroup
+
+- Create two cgroups
+	mkdir -p /cgroup/test1/ /cgroup/test2
+
+- Set weights of group test1 and test2
+	echo 1000 > /cgroup/test1/blkio.weight
+	echo 500 > /cgroup/test2/blkio.weight
+
+- Create two files of the same size (say 512MB each) on the same disk
+  (file1, file2) and launch two dd threads in different cgroups to read
+  those files.
+
+	sync
+	echo 3 > /proc/sys/vm/drop_caches
+
+	dd if=/mnt/sdb/zerofile1 of=/dev/null &
+	echo $! > /cgroup/test1/tasks
+	cat /cgroup/test1/tasks
+
+	dd if=/mnt/sdb/zerofile2 of=/dev/null &
+	echo $! > /cgroup/test2/tasks
+	cat /cgroup/test2/tasks
+
+- At a macro level, the first dd should finish first. To get more precise
+  data, keep looking (with the help of a script) at the blkio.disk_time and
+  blkio.disk_sectors files of both the test1 and test2 groups. This will
+  tell how much disk time (in milliseconds) each group got and how many
+  sectors each group dispatched to the disk. We provide fairness in terms
+  of disk time, so ideally blkio.disk_time of the cgroups should be in
+  proportion to the weight.
+
+Various user visible config options
+===================================
+CONFIG_CFQ_GROUP_IOSCHED
+	- Enables group scheduling in CFQ. Currently only 1 level of group
+	  creation is allowed.
+
+CONFIG_DEBUG_CFQ_IOSCHED
+	- Enables some debugging messages in blktrace. Also creates extra
+	  cgroup file blkio.dequeue.
+
+Config options selected automatically
+=====================================
+These config options are not user visible and are selected/deselected
+automatically based on IO scheduler configuration.
+
+CONFIG_BLK_CGROUP
+	- Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+
+CONFIG_DEBUG_BLK_CGROUP
+	- Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+
+Details of cgroup files
+=======================
+- blkio.ioprio_class
+	- Specifies the class of the cgroup (RT, BE, IDLE). This is the
+	  default io class of the group on all the devices.
+
+	  1 = RT, 2 = BE, 3 = IDLE
+
+- blkio.weight
+	- Specifies per cgroup weight.
+
+	  Currently allowed range of weights is from 100 to 1000.
+
+- blkio.time
+	- Disk time allocated to the cgroup per device, in milliseconds. The
+	  first two fields specify the major and minor number of the device
+	  and the third field specifies the disk time allocated to the group
+	  in milliseconds.
+
+- blkio.sectors
+	- Number of sectors transferred to/from disk by the group. The first
+	  two fields specify the major and minor number of the device and the
+	  third field specifies the number of sectors transferred by the
+	  group to/from the device.
+
+- blkio.dequeue
+	- Debugging aid, only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+	  gives statistics about how many times a group was dequeued from the
+	  service tree of the device. The first two fields specify the major
+	  and minor number of the device and the third field specifies the
+	  number of times a group was dequeued from a particular device.
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56..e20fbde 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
+config BLK_CGROUP
+	bool
+	depends on CGROUPS
+	default n
+	---help---
+	Generic block IO controller cgroup interface. This is the common
+	cgroup interface which should be used by various IO controlling
+	policies.
+
+	Currently, the CFQ IO scheduler uses it to recognize task groups and
+	control disk bandwidth allocation (proportional time slice allocation)
+	to such task groups.
+
+config DEBUG_BLK_CGROUP
+	bool
+	depends on BLK_CGROUP
+	default n
+	---help---
+	Enable some debugging help. Currently it stores the cgroup path
+	in the blk group which can be used by cfq for tracing various
+	group related activity.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 7e803fc..9c5f0b5 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -40,6 +40,23 @@ config IOSCHED_CFQ
 	  working environment, suitable for desktop systems.
 	  This is the default I/O scheduler.
 
+config CFQ_GROUP_IOSCHED
+	bool "CFQ Group Scheduling support"
+	depends on IOSCHED_CFQ && CGROUPS
+	select BLK_CGROUP
+	default n
+	---help---
+	  Enable group IO scheduling in CFQ.
+
+config DEBUG_CFQ_IOSCHED
+	bool "Debug CFQ Scheduling"
+	depends on CFQ_GROUP_IOSCHED
+	select DEBUG_BLK_CGROUP
+	default n
+	---help---
+	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
+	  blktrace output more verbose.
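For illustration only (not part of this patch): the cgroup files documented above are ordinary cgroupfs files, so they can also be driven programmatically. The sketch below assumes the /cgroup mount point and the test1 group from the HOWTO section; it sets the group's weight and dumps its per-device disk time.

#include <stdio.h>

int main(void)
{
	FILE *f;
	char line[256];

	/* set the group weight (allowed range 100-1000) */
	f = fopen("/cgroup/test1/blkio.weight", "w");
	if (!f) {
		perror("blkio.weight");
		return 1;
	}
	fprintf(f, "1000\n");
	fclose(f);

	/* dump per-device disk time: one "major:minor time_in_ms" row per line */
	f = fopen("/cgroup/test1/blkio.time", "r");
	if (!f) {
		perror("blkio.time");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}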
+ choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/Makefile b/block/Makefile index ba74ca6..16334c9 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c new file mode 100644 index 0000000..6a46156 --- /dev/null +++ b/block/blk-cgroup.c @@ -0,0 +1,343 @@ +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ +#include +#include +#include +#include "blk-cgroup.h" + +extern void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); +extern void cfq_update_blkio_group_ioprio_class(struct blkio_group *, + unsigned short); +extern void cfq_delink_blkio_group(void *, struct blkio_group *); + +struct blkio_cgroup blkio_root_cgroup = { + .weight = BLKIO_WEIGHT_DEFAULT, + .ioprio_class = IOPRIO_CLASS_BE, +}; + +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkio_cgroup, css); +} + +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ + blkg->time += time; + blkg->sectors += sectors; +} + +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) +{ + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + rcu_assign_pointer(blkg->key, key); + blkg->blkcg_id = css_id(&blkcg->css); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + spin_unlock_irqrestore(&blkcg->lock, flags); +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Need to take css reference ? */ + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); +#endif + blkg->dev = dev; +} + +static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + hlist_del_init_rcu(&blkg->blkcg_node); + blkg->blkcg_id = 0; +} + +/* + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 + * indicating that blk_group was unhashed by the time we got to it. + */ +int blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + struct blkio_cgroup *blkcg; + unsigned long flags; + struct cgroup_subsys_state *css; + int ret = 1; + + rcu_read_lock(); + css = css_lookup(&blkio_subsys, blkg->blkcg_id); + if (!css) + goto out; + + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); +out: + rcu_read_unlock(); + return ret; +} + +/* called under rcu_read_lock(). 
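For illustration only (not part of the patch): a hypothetical second IO control policy would consume the interface exported by this file in roughly the following way, embedding a struct blkio_group in its own per-cgroup, per-device structure. The foo_* names are made up; error handling and locking are omitted.

struct foo_group {
	struct blkio_group blkg;	/* common cgroup bookkeeping */
	/* ... policy private state ... */
};

static struct foo_group *foo_find_or_create(struct blkio_cgroup *blkcg,
					    void *key, dev_t dev)
{
	struct blkio_group *blkg;
	struct foo_group *fg;

	/* 'key' is an opaque per-device cookie, e.g. the policy's queue data */
	blkg = blkiocg_lookup_group(blkcg, key);
	if (blkg)
		return container_of(blkg, struct foo_group, blkg);

	fg = kzalloc(sizeof(*fg), GFP_ATOMIC);
	if (!fg)
		return NULL;

	/* hash the group off the cgroup so blkio.time/blkio.sectors can show it */
	blkiocg_add_blkio_group(blkcg, &fg->blkg, key, dev);
	return fg;
}

static void foo_account(struct foo_group *fg, unsigned long time,
			unsigned long sectors)
{
	/* feeds the per-device rows reported by blkio.time and blkio.sectors */
	blkiocg_update_blkio_group_stats(&fg->blkg, time, sectors);
}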
*/ +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +{ + struct blkio_group *blkg; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + __key = blkg->key; + if (__key == key) + return blkg; + } + + return NULL; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct blkio_cgroup *blkcg; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + return (u64)blkcg->__VAR; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +static int +blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct hlist_node *n; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + cfq_update_blkio_group_weight(blkg, blkcg->weight); + spin_unlock_irq(&blkcg->lock); + return 0; +} + +static int blkiocg_ioprio_class_write(struct cgroup *cgroup, + struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct hlist_node *n; + + if (val < IOPRIO_CLASS_RT || val > IOPRIO_CLASS_IDLE) + return -EINVAL; + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + blkcg->ioprio_class = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + cfq_update_blkio_group_weight(blkg, blkcg->weight); + spin_unlock_irq(&blkcg->lock); + return 0; +} + +#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype, struct seq_file *m) \ +{ \ + struct blkio_cgroup *blkcg; \ + struct blkio_group *blkg; \ + struct hlist_node *n; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + rcu_read_lock(); \ + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ + if (blkg->dev) \ + seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ + MINOR(blkg->dev), blkg->__VAR); \ + } \ + rcu_read_unlock(); \ + cgroup_unlock(); \ + return 0; \ +} + +SHOW_FUNCTION_PER_GROUP(time); +SHOW_FUNCTION_PER_GROUP(sectors); +#ifdef CONFIG_DEBUG_BLK_CGROUP +SHOW_FUNCTION_PER_GROUP(dequeue); +#endif +#undef SHOW_FUNCTION_PER_GROUP + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->dequeue += dequeue; +} +#endif + +struct cftype blkio_files[] = { + { + .name = "weight", + .read_u64 = blkiocg_weight_read, + .write_u64 = blkiocg_weight_write, + }, + { + .name = "ioprio_class", + .read_u64 = blkiocg_ioprio_class_read, + .write_u64 = blkiocg_ioprio_class_write, + }, + { + .name = "time", + .read_seq_string = blkiocg_time_read, + }, + { + .name = "sectors", + .read_seq_string = blkiocg_sectors_read, + }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "dequeue", + .read_seq_string = blkiocg_dequeue_read, + }, +#endif +}; + +static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, blkio_files, + ARRAY_SIZE(blkio_files)); +} + +static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + unsigned long flags; + struct blkio_group *blkg; + 
void *key; + + rcu_read_lock(); +remove_entry: + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + goto done; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); + + spin_unlock_irqrestore(&blkcg->lock, flags); + + /* + * This blkio_group is being delinked as associated cgroup is going + * away. Let all the IO controlling policies know about this event. + * + * Currently this is static call to one io controlling policy. Once + * we have more policies in place, we need some dynamic registration + * of callback function. + */ + cfq_delink_blkio_group(key, blkg); + goto remove_entry; +done: + free_css_id(&blkio_subsys, &blkcg->css); + rcu_read_unlock(); + + kfree(blkcg); +} + +static struct cgroup_subsys_state * +blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg, *parent_blkcg; + + if (!cgroup->parent) { + blkcg = &blkio_root_cgroup; + goto done; + } + + /* Currently we do not support hierarchy deeper than two level (0,1) */ + parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); + if (css_depth(&parent_blkcg->css) > 0) + return ERR_PTR(-EINVAL); + + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); +done: + spin_lock_init(&blkcg->lock); + INIT_HLIST_HEAD(&blkcg->blkg_list); + blkcg->weight = BLKIO_WEIGHT_DEFAULT; + blkcg->ioprio_class = IOPRIO_CLASS_BE; + + return &blkcg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. 
+ */ +static int blkiocg_can_attach(struct cgroup_subsys *subsys, + struct cgroup *cgroup, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, + .subsys_id = blkio_subsys_id, + .use_id = 1, +}; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 0000000..4ca101d --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,67 @@ +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include + +struct blkio_cgroup { + struct cgroup_subsys_state css; + unsigned int weight; + unsigned short ioprio_class; + spinlock_t lock; + struct hlist_head blkg_list; +}; + +struct blkio_group { + /* An rcu protected unique identifier for the group */ + void *key; + struct hlist_node blkcg_node; + unsigned short blkcg_id; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Store cgroup path */ + char path[128]; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; +#endif + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + + /* total disk time and nr sectors dispatched by this group */ + unsigned long time; + unsigned long sectors; +}; + +#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_DEBUG_BLK_CGROUP +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue); +#else +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +static inline void blkiocg_update_blkio_group_dequeue_stats( + struct blkio_group *blkg, unsigned long dequeue) {} +#endif + +extern struct blkio_cgroup blkio_root_cgroup; +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev); +int blkiocg_del_blkio_group(struct blkio_group *blkg); +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 069a610..f23d713 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -12,6 +12,7 @@ #include #include #include +#include "blk-cgroup.h" /* * tunables @@ -28,6 +29,9 @@ static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +#define IO_IOPRIO_CLASSES 3 +#define CFQ_SERVICE_SHIFT 12 + /* * offset from end of service tree */ @@ -38,7 +42,7 @@ static 
int cfq_slice_idle = HZ / 125; */ #define CFQ_MIN_TT (2) -#define CFQ_SLICE_SCALE (5) +#define CFQ_SLICE_SCALE (500) #define CFQ_HW_QUEUE_MIN (5) #define RQ_CIC(rq) \ @@ -53,8 +57,10 @@ static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); #define CFQ_PRIO_LISTS IOPRIO_BE_NR -#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) +#define cfqe_class_idle(cfqe) ((cfqe)->ioprio_class == IOPRIO_CLASS_IDLE) +#define cfqe_class_rt(cfqe) ((cfqe)->ioprio_class == IOPRIO_CLASS_RT) +#define cfq_class_idle(cfqq) (cfqe_class_idle(&(cfqq)->entity)) +#define cfq_class_rt(cfqq) (cfqe_class_rt(&(cfqq)->entity)) #define sample_valid(samples) ((samples) > 80) @@ -64,26 +70,43 @@ static DEFINE_SPINLOCK(ioc_gone_lock); * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should * move this into the elevator for the rq sorting as well. */ -struct cfq_rb_root { +struct cfq_service_tree { struct rb_root rb; struct rb_node *left; + u64 min_vdisktime; + struct cfq_entity *active; +}; +#define CFQ_RB_ROOT (struct cfq_service_tree) { RB_ROOT, NULL, 0, NULL} + +struct cfq_sched_data { + unsigned int nr_active; + struct cfq_service_tree service_tree[IO_IOPRIO_CLASSES]; +}; + +struct cfq_entity { + struct rb_node rb_node; + u64 vdisktime; + unsigned int weight; + struct cfq_service_tree *st; + unsigned short ioprio_class; + bool ioprio_class_changed; + struct cfq_entity *parent; + bool on_st; + /* Points to the sched_data of group entity. Null for cfqq */ + struct cfq_sched_data *my_sd; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } /* * Per process-grouping structure */ struct cfq_queue { + struct cfq_entity entity; /* reference count */ atomic_t ref; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ struct cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - unsigned long rb_key; /* prio tree member */ struct rb_node p_node; /* prio tree root we belong to, if any */ @@ -99,8 +122,9 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; + /* time when first request from queue completed and slice started. */ + unsigned long slice_start; unsigned long slice_end; - long slice_resid; unsigned int slice_dispatch; /* pending metadata requests */ @@ -110,9 +134,22 @@ struct cfq_queue { /* io prio of this group */ unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; + unsigned short org_ioprio_class; pid_t pid; + /* Sectors dispatched in current dispatch round */ + unsigned long nr_sectors; +}; + +/* Per cgroup grouping structure */ +struct cfq_group { + struct cfq_entity entity; + struct cfq_sched_data sched_data; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct blkio_group blkg; + struct hlist_node cfqd_node; + atomic_t ref; +#endif }; /* @@ -120,11 +157,7 @@ struct cfq_queue { */ struct cfq_data { struct request_queue *queue; - - /* - * rr list of queues with requests and the count of them - */ - struct cfq_rb_root service_tree; + struct cfq_group root_group; /* * Each priority tree is sorted by next_request position. 
These @@ -183,6 +216,9 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_end_sync_rq; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; }; enum cfqq_state_flags { @@ -224,8 +260,29 @@ CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); #undef CFQ_CFQQ_FNS +#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&cfqq_to_cfqg((cfqq))->blkg), ##args); + +#define cfq_log_cfqe(cfqd, cfqe, fmt, args...) \ + if (cfqq_of(cfqe)) { \ + struct cfq_queue *cfqq = cfqq_of(cfqe); \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, \ + (cfqq)->pid, cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&cfqq_to_cfqg((cfqq))->blkg), ##args);\ + } else { \ + struct cfq_group *cfqg = cfqg_of(cfqe); \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args); \ + } +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log_cfqe(cfqd, cfqe, fmt, args...) +#endif + #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -234,6 +291,407 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); +static void cfq_put_queue(struct cfq_queue *cfqq); +static struct cfq_entity *__cfq_get_next_entity(struct cfq_service_tree *st); + +static inline struct cfq_entity *parent_entity(struct cfq_entity *cfqe) +{ + return cfqe->parent; +} + +static inline struct cfq_queue *cfqq_of(struct cfq_entity *cfqe) +{ + if (!cfqe->my_sd) + return container_of(cfqe, struct cfq_queue, entity); + return NULL; +} + +static inline struct cfq_group *cfqg_of(struct cfq_entity *cfqe) +{ + if (cfqe->my_sd) + return container_of(cfqe, struct cfq_group, entity); + return NULL; +} + +static inline void +init_cfqe_service_tree(struct cfq_entity *cfqe, struct cfq_entity *p_cfqe) +{ + struct cfq_group *p_cfqg = cfqg_of(p_cfqe); + unsigned short idx = cfqe->ioprio_class - 1; + + /* + * ioprio class of the entity has not been initialized yet, don't + * init service tree right now. This can happen in the case of + * oom_cfqq which will inherit its class and prio once first request + * gets queued in and at that point of time prio update will make + * sure that service tree gets initialized before queue gets onto + * tree. 
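To make the entity abstraction concrete, here is a small self-contained userspace sketch (illustration only, with simplified types) of the discriminated-embedding pattern used by cfqq_of()/cfqg_of() above: my_sd is NULL for a queue entity and points at the group's scheduling data for a group entity.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct sched_data { int nr_active; };

struct entity {
	struct sched_data *my_sd;	/* NULL for a queue, set for a group */
};

struct queue {
	struct entity entity;
	int pid;
};

struct group {
	struct entity entity;
	struct sched_data sd;
};

static struct queue *queue_of(struct entity *e)
{
	return e->my_sd ? NULL : container_of(e, struct queue, entity);
}

static struct group *group_of(struct entity *e)
{
	return e->my_sd ? container_of(e, struct group, entity) : NULL;
}

int main(void)
{
	struct queue q = { .entity = { .my_sd = NULL }, .pid = 42 };
	struct group g = { { NULL }, { 0 } };

	g.entity.my_sd = &g.sd;

	printf("q: queue? %d group? %d\n",
	       queue_of(&q.entity) != NULL, group_of(&q.entity) != NULL);
	printf("g: queue? %d group? %d\n",
	       queue_of(&g.entity) != NULL, group_of(&g.entity) != NULL);
	return 0;
}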
+ */ + if (cfqe->ioprio_class == IOPRIO_CLASS_NONE) + return; + + BUG_ON(idx >= IO_IOPRIO_CLASSES); + cfqe->st = &p_cfqg->sched_data.service_tree[idx]; +} + +static inline s64 +cfqe_key(struct cfq_service_tree *st, struct cfq_entity *cfqe) +{ + return cfqe->vdisktime - st->min_vdisktime; +} + +static inline u64 +cfq_delta(u64 service, unsigned int numerator_wt, unsigned int denominator_wt) +{ + if (numerator_wt != denominator_wt) { + service = service * numerator_wt; + do_div(service, denominator_wt); + } + + return service; +} + +static inline u64 +cfq_delta_fair(unsigned long delta, struct cfq_entity *cfqe) +{ + u64 d = delta << CFQ_SERVICE_SHIFT; + + return cfq_delta(d, BLKIO_WEIGHT_DEFAULT, cfqe->weight); +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct cfq_service_tree *st) +{ + u64 vdisktime = st->min_vdisktime; + + if (st->active) + vdisktime = st->active->vdisktime; + + if (st->left) { + struct cfq_entity *cfqe = rb_entry(st->left, struct cfq_entity, + rb_node); + + vdisktime = min_vdisktime(vdisktime, cfqe->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + +static inline unsigned int cfq_ioprio_to_weight(int ioprio) +{ + WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + /* Map prio 7 - 0 to weights 200 to 900 */ + return BLKIO_WEIGHT_DEFAULT + (BLKIO_WEIGHT_DEFAULT/5 * (4 - ioprio)); +} + +static inline int +cfq_weight_slice(struct cfq_data *cfqd, int sync, unsigned int weight) +{ + const int base_slice = cfqd->cfq_slice[sync]; + + WARN_ON(weight > BLKIO_WEIGHT_MAX); + + return cfq_delta(base_slice, weight, BLKIO_WEIGHT_DEFAULT); +} + +/* + * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end + * isn't valid until the first request from the dispatch is activated + * and the slice time set. + */ +static inline bool cfq_slice_used(struct cfq_queue *cfqq) +{ + if (cfq_cfqq_slice_new(cfqq)) + return 0; + if (time_before(jiffies, cfqq->slice_end)) + return 0; + + return 1; +} + +static inline void +cfq_init_cfqe_parent(struct cfq_entity *cfqe, struct cfq_entity *p_cfqe) +{ + cfqe->parent = p_cfqe; + init_cfqe_service_tree(cfqe, p_cfqe); +} + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +/* check for entity->parent so that loop is not executed for root entity. 
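A quick userspace restatement of the arithmetic above (illustration only): cfq_ioprio_to_weight() maps ioprio 0..7 to weights 900..200, and cfq_delta_fair() charges vdisktime in inverse proportion to weight, so a heavier entity's key advances more slowly and it is selected again sooner.

#include <stdio.h>

#define BLKIO_WEIGHT_DEFAULT	500
#define CFQ_SERVICE_SHIFT	12

/* same formula as cfq_ioprio_to_weight(): prio 7..0 -> weight 200..900 */
static unsigned int ioprio_to_weight(int ioprio)
{
	return BLKIO_WEIGHT_DEFAULT + (BLKIO_WEIGHT_DEFAULT / 5 * (4 - ioprio));
}

/* same formula as cfq_delta_fair(): vdisktime charged for 'delta' of service */
static unsigned long long delta_fair(unsigned long delta, unsigned int weight)
{
	return ((unsigned long long)delta << CFQ_SERVICE_SHIFT) *
		BLKIO_WEIGHT_DEFAULT / weight;
}

int main(void)
{
	int prio;

	for (prio = 0; prio < 8; prio++)
		printf("ioprio %d -> weight %u, charge for 100 ticks = %llu\n",
		       prio, ioprio_to_weight(prio),
		       delta_fair(100, ioprio_to_weight(prio)));
	return 0;
}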
*/ +#define for_each_entity(entity) \ + for (; entity && entity->parent; entity = entity->parent) + +#define cfqe_is_cfqq(cfqe) (!(cfqe)->my_sd) + +static inline bool cfqq_should_wait_busy(struct cfq_queue *cfqq) +{ + if (!RB_EMPTY_ROOT(&cfqq->sort_list) || !cfq_cfqq_idle_window(cfqq)) + return false; + + if (cfqq->dispatched && !cfq_slice_used(cfqq)) + return false; + + return true; +} + +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +static inline struct cfq_sched_data * +cfq_entity_sched_data(struct cfq_entity *cfqe) +{ + return &cfqg_of(parent_entity(cfqe))->sched_data; +} + +static inline struct cfq_group *cfqq_to_cfqg(struct cfq_queue *cfqq) +{ + return cfqg_of(parent_entity(&cfqq->entity)); +} + +static inline void cfq_get_cfqg_ref(struct cfq_group *cfqg) +{ + atomic_inc(&cfqg->ref); +} + +static void cfq_init_cfqg(struct cfq_group *cfqg, struct blkio_cgroup *blkcg) +{ + struct cfq_entity *cfqe = &cfqg->entity; + + cfqe->weight = blkcg->weight; + cfqe->ioprio_class = blkcg->ioprio_class; + cfqe->ioprio_class_changed = 1; + cfqe->my_sd = &cfqg->sched_data; +} + +static struct cfq_group * +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct cfq_group *cfqg = NULL; + void *key = cfqd; + unsigned int major, minor; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + + /* Do we need to take this reference */ + if (!css_tryget(&blkcg->css)) + return NULL;; + + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg || !create) + goto done; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC | __GFP_ZERO, + cfqd->queue->node); + if (!cfqg) + goto done; + + cfq_init_cfqg(cfqg, blkcg); + cfq_init_cfqe_parent(&cfqg->entity, &cfqd->root_group.entity); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. + */ + cfq_get_cfqg_ref(cfqg); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + MKDEV(major, minor)); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + +done: + css_put(&blkcg->css); + return cfqg; +} + +/* + * Search for the cfq group current task belongs to. If create = 1, then also + * create the cfq group if it does not exist. + * Should be called under request queue lock. 
+ */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + struct cgroup *cgroup; + struct cfq_group *cfqg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + if (!cfqg && create) + cfqg = &cfqd->root_group; + rcu_read_unlock(); + return cfqg; +} + +void +cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct cfq_group *cfqg = cfqg_of_blkg(blkg); + + cfqg->entity.weight = weight; +} + +void cfq_update_blkio_group_ioprio_class(struct blkio_group *blkg, + unsigned short ioprio_class) +{ + struct cfq_group *cfqg = cfqg_of_blkg(blkg); + + cfqg->entity.ioprio_class = ioprio_class; + smp_wmb(); + cfqg->entity.ioprio_class_changed = 1; +} + +static void cfq_put_cfqg(struct cfq_group *cfqg) +{ + struct cfq_service_tree *st; + int i; + + BUG_ON(atomic_read(&cfqg->ref) <= 0); + if (!atomic_dec_and_test(&cfqg->ref)) + return; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) { + st = cfqg->sched_data.service_tree + i; + BUG_ON(!RB_EMPTY_ROOT(&st->rb)); + BUG_ON(st->active != NULL); + } + + kfree(cfqg); +} + +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + + hlist_del_init(&cfqg->cfqd_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + cfq_put_cfqg(cfqg); +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) +{ + struct hlist_node *pos, *n; + struct cfq_group *cfqg; + + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); + } +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu + * read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deltion + * path got to it first. 
+ */ +void cfq_delink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct cfq_data *cfqd = key; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + +static void cfq_update_cfqq_stats(struct cfq_queue *cfqq, + unsigned long slice_used) +{ + struct cfq_entity *cfqe = &cfqq->entity; + + for_each_entity(cfqe) { + struct cfq_group *cfqg = cfqg_of(parent_entity(cfqe)); + blkiocg_update_blkio_group_stats(&cfqg->blkg, slice_used, + cfqq->nr_sectors); + } +} + +#else /* CONFIG_CFQ_GROUP_IOSCHED */ +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define cfqe_is_cfqq(cfqe) 1 + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_get_cfqg_ref(struct cfq_group *cfqg) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} + +static inline bool cfqq_should_wait_busy(struct cfq_queue *cfqq) +{ + return false; +} + +static inline struct cfq_data *cfqd_of(struct cfq_entity *cfqe) +{ + return cfqq_of(cfqe)->cfqd; +} + +static inline struct cfq_sched_data * +cfq_entity_sched_data(struct cfq_entity *cfqe) +{ + struct cfq_data *cfqd = cfqd_of(cfqe); + + return &cfqd->root_group.sched_data; +} + +static inline struct cfq_group *cfqq_to_cfqg(struct cfq_queue *cfqq) +{ + return &cfqq->cfqd->root_group; +} + +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} + +static inline void cfq_update_cfqq_stats(struct cfq_queue *cfqq, + unsigned long slice_used) {} +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ static inline int rq_in_driver(struct cfq_data *cfqd) { @@ -277,53 +735,24 @@ static int cfq_queue_empty(struct request_queue *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - return !cfqd->busy_queues; -} - -/* - * Scale schedule slice based on io priority. Use the sync time slice only - * if a queue is marked sync and has sync io queued. A sync queue with async - * io only, should not get full sync slice length. - */ -static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, - unsigned short prio) -{ - const int base_slice = cfqd->cfq_slice[sync]; - - WARN_ON(prio >= IOPRIO_BE_NR); - - return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); + return !cfqd->rq_queued; } static inline int cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); + return cfq_weight_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->entity.weight); } static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + cfqq->slice_start = jiffies; cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } /* - * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end - * isn't valid until the first request from the dispatch is activated - * and the slice time set. - */ -static inline bool cfq_slice_used(struct cfq_queue *cfqq) -{ - if (cfq_cfqq_slice_new(cfqq)) - return 0; - if (time_before(jiffies, cfqq->slice_end)) - return 0; - - return 1; -} - -/* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closest to the head right now. Distance * behind the head is penalized and only allowed to a certain extent. 
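Because cfq_prio_to_slice() now goes through cfq_weight_slice(), the allocated time slice also scales linearly with the entity weight. A small illustration (not part of the patch; the 100ms base is an assumption, corresponding to the usual sync base slice at HZ=1000):

#include <stdio.h>

#define BLKIO_WEIGHT_DEFAULT	500

/* same scaling as cfq_weight_slice(): slice = base * weight / default */
static unsigned int weight_slice(unsigned int base_slice, unsigned int weight)
{
	return base_slice * weight / BLKIO_WEIGHT_DEFAULT;
}

int main(void)
{
	unsigned int base = 100;	/* assumed 100ms sync base slice */
	unsigned int weights[] = { 200, 500, 900, 1000 };
	unsigned int i;

	for (i = 0; i < sizeof(weights) / sizeof(weights[0]); i++)
		printf("weight %4u -> slice %u ms\n",
		       weights[i], weight_slice(base, weights[i]));
	return 0;
}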
@@ -419,33 +848,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) } /* - * The below is leftmost cache rbtree addon - */ -static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) -{ - if (!root->left) - root->left = rb_first(&root->rb); - - if (root->left) - return rb_entry(root->left, struct cfq_queue, rb_node); - - return NULL; -} - -static void rb_erase_init(struct rb_node *n, struct rb_root *root) -{ - rb_erase(n, root); - RB_CLEAR_NODE(n); -} - -static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) -{ - if (root->left == n) - root->left = NULL; - rb_erase_init(n, &root->rb); -} - -/* * would be nice to take fifo expire time into account as well */ static struct request * @@ -472,102 +874,243 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, return cfq_choose_req(cfqd, next, prev); } -static unsigned long cfq_slice_offset(struct cfq_data *cfqd, - struct cfq_queue *cfqq) +static void +place_cfqe(struct cfq_service_tree *st, struct cfq_entity *cfqe, int add_front) +{ + u64 vdisktime = st->min_vdisktime; + struct rb_node *parent; + struct cfq_entity *__cfqe; + + if (cfqe_class_idle(cfqe)) { + vdisktime = CFQ_IDLE_DELAY; + parent = rb_last(&st->rb); + if (parent && parent != &cfqe->rb_node) { + __cfqe = rb_entry(parent, struct cfq_entity, rb_node); + vdisktime += __cfqe->vdisktime; + } else + vdisktime += st->min_vdisktime; + } else if (!add_front) { + parent = rb_last(&st->rb); + if (parent && parent != &cfqe->rb_node) { + __cfqe = rb_entry(parent, struct cfq_entity, rb_node); + vdisktime = __cfqe->vdisktime; + } + } + + cfqe->vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + +static inline void cfqe_update_ioprio_class(struct cfq_entity *cfqe) { + if (unlikely(cfqe->ioprio_class_changed)) { + /* + * Re-initialize the service tree pointer as ioprio class + * change will lead to service tree change. 
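The placement rule in place_cfqe() can be summarized as follows (a simplified illustration assuming a non-empty tree; CFQ_IDLE_DELAY's value is not visible in this hunk, so the constant below is made up): normal enqueues go behind the last entity, idle class entities are pushed an extra delay behind it, and front requeues (preemption) stay at min_vdisktime.

#include <stdio.h>

#define EXAMPLE_IDLE_DELAY	200	/* stand-in for CFQ_IDLE_DELAY */

static unsigned long long place(unsigned long long min_vdisktime,
				unsigned long long last_vdisktime,
				int idle_class, int add_front)
{
	unsigned long long vdisktime;

	if (idle_class)
		/* idle class always goes behind everybody else */
		vdisktime = last_vdisktime + EXAMPLE_IDLE_DELAY;
	else if (!add_front)
		/* normal case: queue up behind the last entity */
		vdisktime = last_vdisktime;
	else
		/* preemption: keep the entity at the head of the tree */
		vdisktime = min_vdisktime;

	/* never place an entity before min_vdisktime */
	return vdisktime > min_vdisktime ? vdisktime : min_vdisktime;
}

int main(void)
{
	printf("BE entity:     %llu\n", place(1000, 1500, 0, 0));	/* 1500 */
	printf("IDLE entity:   %llu\n", place(1000, 1500, 1, 0));	/* 1700 */
	printf("front requeue: %llu\n", place(1000, 1500, 0, 1));	/* 1000 */
	return 0;
}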
+ */ + init_cfqe_service_tree(cfqe, parent_entity(cfqe)); + cfqe->ioprio_class_changed = 0; + cfqe->vdisktime = 0; + } +} + +static void __dequeue_cfqe(struct cfq_service_tree *st, struct cfq_entity *cfqe) +{ + /* Node is not on tree */ + if (RB_EMPTY_NODE(&cfqe->rb_node)) + return; + + if (st->left == &cfqe->rb_node) + st->left = rb_next(&cfqe->rb_node); + + rb_erase(&cfqe->rb_node, &st->rb); + RB_CLEAR_NODE(&cfqe->rb_node); +} + +static void dequeue_cfqe(struct cfq_entity *cfqe) +{ + struct cfq_service_tree *st = cfqe->st; + struct cfq_sched_data *sd = cfq_entity_sched_data(cfqe); + + if (st->active == cfqe) + st->active = NULL; + + __dequeue_cfqe(st, cfqe); + sd->nr_active--; + cfqe->on_st = 0; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfqe_is_cfqq(cfqe)) + blkiocg_update_blkio_group_dequeue_stats(&cfqg_of(cfqe)->blkg, + 1); +#endif +} + +static void dequeue_cfqq(struct cfq_queue *cfqq) +{ + struct cfq_entity *cfqe = &cfqq->entity; + + for_each_entity(cfqe) { + struct cfq_sched_data *sd = cfq_entity_sched_data(cfqe); + + dequeue_cfqe(cfqe); + if (!cfqe_is_cfqq(cfqe)) { + cfq_log_cfqe(cfqq->cfqd, cfqe, "del_from_rr group"); + } + + /* Do not dequeue parent if it has other entities under it */ + if (sd->nr_active) + break; + } +} + +static void __enqueue_cfqe(struct cfq_service_tree *st, struct cfq_entity *cfqe, + int add_front) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct cfq_entity *__cfqe; + s64 key = cfqe_key(st, cfqe); + int leftmost = 1; + + while (*node != NULL) { + parent = *node; + __cfqe = rb_entry(parent, struct cfq_entity, rb_node); + + if (key < cfqe_key(st, __cfqe) || + (add_front && (key == cfqe_key(st, __cfqe)))) { + node = &parent->rb_left; + } else { + node = &parent->rb_right; + leftmost = 0; + } + } + /* - * just an approximation, should be ok. + * Maintain a cache of leftmost tree entries (it is frequently + * used) */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); + if (leftmost) + st->left = &cfqe->rb_node; + + rb_link_node(&cfqe->rb_node, parent, node); + rb_insert_color(&cfqe->rb_node, &st->rb); } -/* - * The cfqd->service_tree holds all pending cfq_queue's that have - * requests waiting to be processed. It is sorted in the order that - * we will service the queues. - */ -static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool add_front) +static void enqueue_cfqe(struct cfq_entity *cfqe) { - struct rb_node **p, *parent; - struct cfq_queue *__cfqq; - unsigned long rb_key; - int left; - - if (cfq_class_idle(cfqq)) { - rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); - if (parent && parent != &cfqq->rb_node) { - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - rb_key += __cfqq->rb_key; - } else - rb_key += jiffies; - } else if (!add_front) { - /* - * Get our rb key offset. Subtract any residual slice - * value carried from last service. A negative resid - * count indicates slice overrun, and this should position - * the next service time further away in the tree. - */ - rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; - rb_key -= cfqq->slice_resid; - cfqq->slice_resid = 0; - } else { - rb_key = -HZ; - __cfqq = cfq_rb_first(&cfqd->service_tree); - rb_key += __cfqq ? 
__cfqq->rb_key : jiffies; + struct cfq_sched_data *sd = cfq_entity_sched_data(cfqe); + + cfqe->on_st = 1; + sd->nr_active++; + cfqe_update_ioprio_class(cfqe); + place_cfqe(cfqe->st, cfqe, 0); + __enqueue_cfqe(cfqe->st, cfqe, 0); +} + +static void enqueue_cfqq(struct cfq_queue *cfqq) +{ + struct cfq_entity *cfqe = &cfqq->entity; + + for_each_entity(cfqe) { + if (cfqe->on_st) + break; + enqueue_cfqe(cfqe); } +} - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - /* - * same position, nothing more to do - */ - if (rb_key == cfqq->rb_key) - return; +/* Requeue a cfqq which is already on the service tree */ +static void requeue_cfqe(struct cfq_entity *cfqe, int add_front) +{ + struct cfq_service_tree *st = cfqe->st; + struct cfq_entity *next_cfqe; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + if (add_front) { + next_cfqe = __cfq_get_next_entity(st); + if (next_cfqe && next_cfqe == cfqe) + return; } - left = 1; - parent = NULL; - p = &cfqd->service_tree.rb.rb_node; - while (*p) { - struct rb_node **n; + __dequeue_cfqe(st, cfqe); + place_cfqe(st, cfqe, add_front); + __enqueue_cfqe(st, cfqe, add_front); +} - parent = *p; - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); +static void requeue_cfqq(struct cfq_queue *cfqq, int add_front) +{ + requeue_cfqe(&cfqq->entity, add_front); +} + +static void cfqe_served(struct cfq_entity *cfqe, unsigned long served) +{ + struct cfq_data *cfqd = cfqq_of(cfqe)->cfqd; + for_each_entity(cfqe) { /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. + * Can't update entity disk time while it is on sorted rb-tree + * as vdisktime is used as key. */ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (time_before(rb_key, __cfqq->rb_key)) - n = &(*p)->rb_left; - else - n = &(*p)->rb_right; + __dequeue_cfqe(cfqe->st, cfqe); + cfqe->vdisktime += cfq_delta_fair(served, cfqe); + update_min_vdisktime(cfqe->st); + __enqueue_cfqe(cfqe->st, cfqe, 0); + cfq_log_cfqe(cfqd, cfqe, "served: vt=%llx min_vt=%llx", + cfqe->vdisktime, cfqe->st->min_vdisktime); + + /* If entity prio class has changed, take that into account */ + if (unlikely(cfqe->ioprio_class_changed)) { + dequeue_cfqe(cfqe); + enqueue_cfqe(cfqe); + } + } +} - if (n == &(*p)->rb_right) - left = 0; +static void cfqq_served(struct cfq_queue *cfqq, unsigned long served) +{ + /* + * We don't want to charge more than allocated slice otherwise this + * queue can miss one dispatch round doubling max latencies. On the + * other hand we don't want to charge less than allocated slice as + * we stick to CFQ theme of queue loosing its share if it does not + * use the slice and moves to the back of service tree (almost). + */ + served = cfq_prio_to_slice(cfqq->cfqd, cfqq); + cfqe_served(&cfqq->entity, served); +} - p = n; +/* + * Handles three operations. + * Addition of a new queue to service tree, when a new request comes in. + * Resorting of an expiring queue (used after slice expired) + * Requeuing a queue at the front (used during preemption). + */ +static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, + bool add_front, unsigned long service) +{ + if (RB_EMPTY_NODE(&cfqq->entity.rb_node)) { + /* Its a new queue. 
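Putting cfq_weight_slice() and cfqe_served() together: each service round charges the queue its full weight-scaled slice, and the weight-normalized vdisktime charge then comes out the same for every queue, so backlogged queues alternate and their disk time converges to the weight ratio. A self-contained simulation (illustration only):

#include <stdio.h>

#define BLKIO_WEIGHT_DEFAULT	500
#define CFQ_SERVICE_SHIFT	12

/* slice length scaled by weight, as in cfq_weight_slice() */
static unsigned long wslice(unsigned long base, unsigned int weight)
{
	return base * weight / BLKIO_WEIGHT_DEFAULT;
}

/* vdisktime charged for 'served' time units, as in cfq_delta_fair() */
static unsigned long long charge(unsigned long served, unsigned int weight)
{
	return ((unsigned long long)served << CFQ_SERVICE_SHIFT) *
		BLKIO_WEIGHT_DEFAULT / weight;
}

int main(void)
{
	/* two always-backlogged sync queues, e.g. ioprio 0 and ioprio 7 */
	unsigned int weight[2] = { 900, 200 };
	unsigned long long vdisktime[2] = { 0, 0 };
	unsigned long service[2] = { 0, 0 };
	unsigned long base = 100;	/* assumed base sync slice */
	int i, round;

	for (round = 0; round < 1000; round++) {
		/* pick the entity with the smallest vdisktime, as the rb-tree does */
		i = vdisktime[0] <= vdisktime[1] ? 0 : 1;
		/* the queue is charged its full, weight scaled, slice */
		service[i] += wslice(base, weight[i]);
		vdisktime[i] += charge(wslice(base, weight[i]), weight[i]);
	}

	printf("disk time %lu vs %lu (weights %u : %u)\n",
	       service[0], service[1], weight[0], weight[1]);
	return 0;
}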
Add it to service tree */ + enqueue_cfqq(cfqq); + return; } - if (left) - cfqd->service_tree.left = &cfqq->rb_node; + if (service) { + /* + * This queue just got served. Compute the new key and requeue + * in the service tree + */ + cfqq_served(cfqq, service); + + /* + * Requeue async ioq so that these will be again placed at the + * end of service tree giving a chance to sync queues. + * TODO: Handle this case in a better manner. + */ + if (!cfq_cfqq_sync(cfqq)) + requeue_cfqq(cfqq, 0); + return; + } - cfqq->rb_key = rb_key; - rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + /* Just requeuing an existing queue, used during preemption */ + requeue_cfqq(cfqq, add_front); } static struct cfq_queue * @@ -634,13 +1177,14 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * Update cfqq's position in the service tree. */ -static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq, + unsigned long service) { /* * Resorting requires the cfqq to be on the RR list already. */ if (cfq_cfqq_on_rr(cfqq)) { - cfq_service_tree_add(cfqd, cfqq, 0); + cfq_service_tree_add(cfqd, cfqq, 0, service); cfq_prio_tree_add(cfqd, cfqq); } } @@ -656,7 +1200,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - cfq_resort_rr_list(cfqd, cfqq); + cfq_resort_rr_list(cfqd, cfqq, 0); } /* @@ -669,8 +1213,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + dequeue_cfqq(cfqq); if (cfqq->p_root) { rb_erase(&cfqq->p_node, cfqq->p_root); cfqq->p_root = NULL; @@ -686,7 +1229,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -694,8 +1236,17 @@ static void cfq_del_rq_rb(struct request *rq) elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { + /* + * Queue will be deleted from service tree when we actually + * expire it later. Right now just remove it from prio tree + * as it is empty. + */ + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -846,6 +1397,9 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct cfq_io_context *cic; struct cfq_queue *cfqq; + /* Deny merge if bio and rq don't belong to same cfq group */ + if (cfqq_to_cfqg(RQ_CFQQ(rq)) != cfq_get_cfqg(cfqd, 0)) + return false; /* * Disallow merge of a sync bio into an async request. 
*/ @@ -869,8 +1423,10 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfqq->slice_start = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -888,10 +1444,11 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, * current cfqq expired its slice (or was too idle), select new one */ static void -__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool timed_out) +__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); + long slice_used = 0; + + cfq_log_cfqq(cfqd, cfqq, "slice expired"); if (cfq_cfqq_wait_request(cfqq)) del_timer(&cfqd->idle_slice_timer); @@ -899,14 +1456,21 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); /* - * store what was left of this slice, if the queue idled/timed out + * Queue got expired before even a single request completed or + * got expired immediately after first request completion. */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) { - cfqq->slice_resid = cfqq->slice_end - jiffies; - cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); - } + if (!cfqq->slice_end || cfqq->slice_start == jiffies) + slice_used = 1; + else + slice_used = jiffies - cfqq->slice_start; - cfq_resort_rr_list(cfqd, cfqq); + cfq_log_cfqq(cfqd, cfqq, "sl_used=%ld", slice_used); + cfq_update_cfqq_stats(cfqq, slice_used); + + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_del_cfqq_rr(cfqd, cfqq); + + cfq_resort_rr_list(cfqd, cfqq, slice_used); if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; @@ -917,12 +1481,40 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, } } -static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) +static inline void cfq_slice_expired(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; if (cfqq) - __cfq_slice_expired(cfqd, cfqq, timed_out); + __cfq_slice_expired(cfqd, cfqq); +} + +static struct cfq_entity *__cfq_get_next_entity(struct cfq_service_tree *st) +{ + struct rb_node *left = st->left; + + if (!left) + return NULL; + + return rb_entry(left, struct cfq_entity, rb_node); +} + +static struct cfq_entity *cfq_get_next_entity(struct cfq_sched_data *sd) +{ + struct cfq_service_tree *st = sd->service_tree; + struct cfq_entity *cfqe = NULL; + int i; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) { + cfqe = __cfq_get_next_entity(st); + if (cfqe) { + st->active = cfqe; + update_min_vdisktime(cfqe->st); + break; + } + } + + return cfqe; } /* @@ -931,10 +1523,20 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) + struct cfq_entity *cfqe = NULL; + struct cfq_sched_data *sd; + + if (!cfqd->rq_queued) return NULL; - return cfq_rb_first(&cfqd->service_tree); + sd = &cfqd->root_group.sched_data; + for (; sd ; sd = cfqe->my_sd) { + cfqe = cfq_get_next_entity(sd); + if (!cfqe) + return NULL; + } + + return cfqq_of(cfqe); } /* @@ -1049,6 +1651,10 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfqq) return NULL; + /* If new queue belongs to different cfq_group, don't choose it */ + if (cfqq_to_cfqg(cur_cfqq) != cfqq_to_cfqg(cfqq)) + return NULL; + if (cfq_cfqq_coop(cfqq)) return NULL; @@ -1057,19 
+1663,22 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, return cfqq; } -static void cfq_arm_slice_timer(struct cfq_data *cfqd) +static bool cfq_arm_slice_timer(struct cfq_data *cfqd, int reset) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; unsigned long sl; + /* If idle timer is already armed, nothing to do */ + if (!reset && timer_pending(&cfqd->idle_slice_timer)) + return true; /* * SSD device without seek penalty, disable idling. But only do so * for devices that support queuing, otherwise we still have a problem * with sync vs async workloads. */ if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) - return; + return false; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); WARN_ON(cfq_cfqq_slice_new(cfqq)); @@ -1078,29 +1687,14 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * idle is disabled, either manually or by past process history */ if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) - return; - - /* - * still requests with the driver, don't idle - */ - if (rq_in_driver(cfqd)) - return; + return false; /* * task has exited, don't wait */ cic = cfqd->active_cic; if (!cic || !atomic_read(&cic->ioc->nr_tasks)) - return; - - /* - * If our average think time is larger than the remaining time - * slice, then don't idle. This avoids overrunning the allotted - * time slice. - */ - if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) - return; + return false; cfq_mark_cfqq_wait_request(cfqq); @@ -1114,7 +1708,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu reset=%d", sl, reset); + return true; } /* @@ -1134,6 +1729,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight++; + cfqq->nr_sectors += blk_rq_sectors(rq); } /* @@ -1181,6 +1777,15 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) if (!cfqq) goto new_queue; + if (!cfqd->rq_queued) + return NULL; + + /* Wait for a queue to get busy before we expire it */ + if (cfqq_should_wait_busy(cfqq) && cfq_arm_slice_timer(cfqd, 0)) { + cfqq = NULL; + goto keep_queue; + } + /* * The active queue has run out of time, expire it and select new. */ @@ -1216,7 +1821,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) } expire: - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd); new_queue: cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: @@ -1233,6 +1838,10 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) } BUG_ON(!list_empty(&cfqq->fifo)); + + /* By default cfqq is not expired if it is empty. 
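The wait-busy decision used above can be read as follows; this is just an annotated restatement of cfqq_should_wait_busy(), not a new mechanism:

static bool example_should_wait_busy(bool has_queued_requests,
				     bool idle_window,
				     int dispatched, bool slice_used)
{
	/* still has work queued, or never idles: nothing to wait for */
	if (has_queued_requests || !idle_window)
		return false;

	/* requests still in flight and slice not used up: completions will come */
	if (dispatched && !slice_used)
		return false;

	/*
	 * The queue is empty and either has nothing in flight or its slice
	 * is done. Keep it nominally active and arm the idle timer, so that
	 * the queue (and hence its group) does not lose its share just
	 * because it momentarily ran out of requests.
	 */
	return true;
}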
Do it explicitly */ + __cfq_slice_expired(cfqq->cfqd, cfqq); + return dispatched; } @@ -1245,10 +1854,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) + while ((cfqq = cfq_get_next_queue(cfqd)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd); BUG_ON(cfqd->busy_queues); @@ -1391,7 +2000,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || cfq_class_idle(cfqq))) { cfqq->slice_end = jiffies + 1; - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd); } cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); @@ -1402,11 +2011,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) * task holds one reference to the queue, dropped when task exits. each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * + * Each cfq queue took a reference on the parent group. Drop it now. * queue lock must be held here. */ static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; + struct cfq_group *cfqg; BUG_ON(atomic_read(&cfqq->ref) <= 0); @@ -1416,14 +2027,17 @@ static void cfq_put_queue(struct cfq_queue *cfqq) cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); + cfqg = cfqq_to_cfqg(cfqq); if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0); + __cfq_slice_expired(cfqd, cfqq); cfq_schedule_dispatch(cfqd); } + BUG_ON(cfq_cfqq_on_rr(cfqq)); + kmem_cache_free(cfq_pool, cfqq); + cfq_put_cfqg(cfqg); } /* @@ -1514,7 +2128,7 @@ static void cfq_free_io_context(struct io_context *ioc) static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); + __cfq_slice_expired(cfqd, cfqq); cfq_schedule_dispatch(cfqd); } @@ -1617,29 +2231,33 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) * no prio set, inherit CPU scheduling settings */ cfqq->ioprio = task_nice_ioprio(tsk); - cfqq->ioprio_class = task_nice_ioclass(tsk); + cfqq->entity.ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: cfqq->ioprio = task_ioprio(ioc); - cfqq->ioprio_class = IOPRIO_CLASS_RT; + cfqq->entity.ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: cfqq->ioprio = task_ioprio(ioc); - cfqq->ioprio_class = IOPRIO_CLASS_BE; + cfqq->entity.ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: - cfqq->ioprio_class = IOPRIO_CLASS_IDLE; + cfqq->entity.ioprio_class = IOPRIO_CLASS_IDLE; cfqq->ioprio = 7; cfq_clear_cfqq_idle_window(cfqq); break; } + cfqq->entity.weight = cfq_ioprio_to_weight(cfqq->ioprio); + + if (cfqq->org_ioprio_class != cfqq->entity.ioprio_class) + cfqq->entity.ioprio_class_changed = 1; /* * keep track of original prio settings in case we have to temporarily * elevate the priority of this queue */ cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; + cfqq->org_ioprio_class = cfqq->entity.ioprio_class; cfq_clear_cfqq_prio_changed(cfqq); } @@ -1681,7 +2299,7 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, pid_t pid, bool is_sync) { - RB_CLEAR_NODE(&cfqq->rb_node); + RB_CLEAR_NODE(&cfqq->entity.rb_node); 
RB_CLEAR_NODE(&cfqq->p_node); INIT_LIST_HEAD(&cfqq->fifo); @@ -1698,14 +2316,65 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + cfqq->entity.my_sd = NULL; + + /* Currently, all async queues are mapped to root group */ + if (!cfq_cfqq_sync(cfqq)) + cfqg = &cfqq->cfqd->root_group; + + cfq_init_cfqe_parent(&cfqq->entity, &cfqg->entity); + + /* cfqq reference on cfqg */ + cfq_get_cfqg_ref(cfqg); +} + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + if (sync_cfqq) { + /* + * Drop reference to sync queue. A new sync queue will be + * assigned in new group upon arrival of a fresh request. + */ + cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct cfq_group *cfqg; retry: + cfqg = cfq_get_cfqg(cfqd, 1); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -1736,6 +2405,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else cfqq = &cfqd->oom_cfqq; @@ -1927,6 +2597,10 @@ out: if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); @@ -2016,6 +2690,36 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, } } +static bool cfq_should_preempt_group(struct cfq_data *cfqd, + struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) +{ + struct cfq_entity *cfqe = &cfqq->entity; + struct cfq_entity *new_cfqe = &new_cfqq->entity; + + if (cfqq_to_cfqg(cfqq) != &cfqd->root_group) + cfqe = parent_entity(&cfqq->entity); + + if (cfqq_to_cfqg(new_cfqq) != &cfqd->root_group) + new_cfqe = parent_entity(&new_cfqq->entity); + + /* + * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. + */ + + if (new_cfqe->ioprio_class == IOPRIO_CLASS_RT + && cfqe->ioprio_class != IOPRIO_CLASS_RT) + return true; + /* + * Allow an BE request to pre-empt an ongoing IDLE clas timeslice. + */ + + if (new_cfqe->ioprio_class == IOPRIO_CLASS_BE + && cfqe->ioprio_class == IOPRIO_CLASS_IDLE) + return true; + + return false; +} + /* * Check if new_cfqq should preempt the currently active queue. Return 0 for * no or if we aren't sure, a 1 will cause a preempt. @@ -2046,6 +2750,9 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; + if (cfqq_to_cfqg(new_cfqq) != cfqq_to_cfqg(cfqq)) + return cfq_should_preempt_group(cfqd, cfqq, new_cfqq); + /* * So both queues are sync. 
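The cross-group preemption rule in cfq_should_preempt_group() compares only io classes, taken at the group level for non-root groups (and at the queue itself in the root group); within the same class no cross-group preemption happens and vdisktime ordering decides. An enumeration of the rule (illustration only):

#include <stdio.h>

enum { CLASS_NONE, CLASS_RT, CLASS_BE, CLASS_IDLE };	/* mirrors IOPRIO_CLASS_* */

static int group_should_preempt(int new_class, int cur_class)
{
	/* an RT group may preempt an ongoing non-RT timeslice */
	if (new_class == CLASS_RT && cur_class != CLASS_RT)
		return 1;
	/* a BE group may preempt an ongoing IDLE timeslice */
	if (new_class == CLASS_BE && cur_class == CLASS_IDLE)
		return 1;
	return 0;
}

int main(void)
{
	const char *name[] = { "NONE", "RT", "BE", "IDLE" };
	int new, cur;

	for (new = CLASS_RT; new <= CLASS_IDLE; new++)
		for (cur = CLASS_RT; cur <= CLASS_IDLE; cur++)
			printf("new=%-4s cur=%-4s -> %s\n",
			       name[new], name[cur],
			       group_should_preempt(new, cur) ?
			       "preempt" : "no preempt");
	return 0;
}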
Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. @@ -2079,7 +2786,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); + cfq_slice_expired(cfqd); /* * Put the new queue at the front of the of the current list, @@ -2087,7 +2794,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) */ BUG_ON(!cfq_cfqq_on_rr(cfqq)); - cfq_service_tree_add(cfqd, cfqq, 1); + cfq_service_tree_add(cfqd, cfqq, 1, 0); cfqq->slice_end = 0; cfq_mark_cfqq_slice_new(cfqq); @@ -2129,8 +2836,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); __blk_run_queue(cfqd->queue); - } - cfq_mark_cfqq_must_dispatch(cfqq); + } else + cfq_mark_cfqq_must_dispatch(cfqq); } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* @@ -2229,10 +2936,13 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * of idling. */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1); + if (!cfqq_should_wait_busy(cfqq)) + cfq_slice_expired(cfqd); + else + cfq_arm_slice_timer(cfqd, 1); else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && sync && !rq_noidle(rq)) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd, 1); } if (!rq_in_driver(cfqd)) @@ -2250,16 +2960,20 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) * boost idle prio on transactions that would lock out other * users of the filesystem */ - if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; + if (cfq_class_idle(cfqq)) { + cfqq->entity.ioprio_class = IOPRIO_CLASS_BE; + cfqq->entity.ioprio_class_changed = 1; + } if (cfqq->ioprio > IOPRIO_NORM) cfqq->ioprio = IOPRIO_NORM; } else { /* * check if we need to unboost the queue */ - if (cfqq->ioprio_class != cfqq->org_ioprio_class) - cfqq->ioprio_class = cfqq->org_ioprio_class; + if (cfqq->entity.ioprio_class != cfqq->org_ioprio_class) { + cfqq->entity.ioprio_class = cfqq->org_ioprio_class; + cfqq->entity.ioprio_class_changed = 1; + } if (cfqq->ioprio != cfqq->org_ioprio) cfqq->ioprio = cfqq->org_ioprio; } @@ -2391,7 +3105,6 @@ static void cfq_idle_slice_timer(unsigned long data) struct cfq_data *cfqd = (struct cfq_data *) data; struct cfq_queue *cfqq; unsigned long flags; - int timed_out = 1; cfq_log(cfqd, "idle timer fired"); @@ -2399,7 +3112,6 @@ static void cfq_idle_slice_timer(unsigned long data) cfqq = cfqd->active_queue; if (cfqq) { - timed_out = 0; /* * We saw a request before the queue expired, let it through @@ -2427,7 +3139,7 @@ static void cfq_idle_slice_timer(unsigned long data) goto out_kick; } expire: - cfq_slice_expired(cfqd, timed_out); + cfq_slice_expired(cfqd); out_kick: cfq_schedule_dispatch(cfqd); out_cont: @@ -2455,6 +3167,36 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } +static void cfq_init_root_group(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = &cfqd->root_group; + int i; + + cfqg->entity.parent = NULL; + cfqg->entity.my_sd = &cfqg->sched_data; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) + cfqg->sched_data.service_tree[i] = CFQ_RB_ROOT; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + atomic_set(&cfqg->ref, 0); + /* + * Take a reference to root group which we never drop. 
This is just + * to make sure that cfq_put_cfqg() does not try to kfree root group + */ + cfq_get_cfqg_ref(cfqg); + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, + 0); +#endif +} + +static void cfq_exit_root_group(struct cfq_data *cfqd) +{ +#ifdef CONFIG_CFQ_GROUP_IOSCHED + blkiocg_del_blkio_group(&cfqd->root_group.blkg); +#endif +} + static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -2465,7 +3207,7 @@ static void cfq_exit_queue(struct elevator_queue *e) spin_lock_irq(q->queue_lock); if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0); + __cfq_slice_expired(cfqd, cfqd->active_queue); while (!list_empty(&cfqd->cic_list)) { struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, @@ -2477,10 +3219,14 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_put_async_queues(cfqd); + cfq_release_cfq_groups(cfqd); + cfq_exit_root_group(cfqd); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ + synchronize_rcu(); kfree(cfqd); } @@ -2493,7 +3239,7 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; + cfq_init_root_group(cfqd); /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -2510,6 +3256,7 @@ static void *cfq_init_queue(struct request_queue *q) */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); atomic_inc(&cfqd->oom_cfqq.ref); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31b..ccefff0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -60,3 +60,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_BLK_CGROUP +SUBSYS(blkio) +#endif + +/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 4da4a75..5357d5c 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -73,6 +73,10 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_BLK_CGROUP + unsigned short cgroup_changed; +#endif + /* * For request batching */