Documentation/block/00-INDEX | 2 + Documentation/block/io-controller.txt | 407 ++++ block/Kconfig.iosched | 89 + block/Makefile | 1 + block/as-iosched.c | 503 ++++- block/blk-core.c | 350 +++- block/blk-ioc.c | 33 +- block/blk-settings.c | 1 + block/blk-sysfs.c | 59 +- block/blk.h | 6 + block/cfq-iosched.c | 1179 ++++-------- block/deadline-iosched.c | 129 +- block/elevator-fq.c | 3365 +++++++++++++++++++++++++++++++++ block/elevator-fq.h | 644 +++++++ block/elevator.c | 217 ++- block/noop-iosched.c | 73 +- drivers/md/dm-table.c | 11 +- drivers/md/dm.c | 7 +- drivers/md/dm.h | 3 +- drivers/md/linear.c | 7 +- drivers/md/multipath.c | 7 +- drivers/md/raid0.c | 6 +- drivers/md/raid1.c | 9 +- drivers/md/raid10.c | 6 +- drivers/md/raid5.c | 2 +- fs/afs/write.c | 8 +- fs/btrfs/disk-io.c | 6 +- fs/btrfs/extent_io.c | 12 + fs/btrfs/volumes.c | 8 +- fs/buffer.c | 2 + fs/cifs/file.c | 11 + fs/direct-io.c | 2 + fs/ext2/ialloc.c | 2 +- fs/gfs2/aops.c | 12 + fs/nilfs2/segbuf.c | 3 +- fs/xfs/linux-2.6/xfs_aops.c | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 2 +- include/linux/backing-dev.h | 63 +- include/linux/biotrack.h | 100 + include/linux/blkdev.h | 66 +- include/linux/cgroup_subsys.h | 12 + include/linux/elevator.h | 91 +- include/linux/iocontext.h | 6 + include/linux/memcontrol.h | 6 + include/linux/mmzone.h | 4 +- include/linux/page_cgroup.h | 5 +- include/trace/events/block.h | 6 +- init/Kconfig | 24 + kernel/trace/blktrace.c | 6 +- mm/Makefile | 4 +- mm/backing-dev.c | 74 +- mm/biotrack.c | 293 +++ mm/bounce.c | 2 + mm/filemap.c | 2 + mm/memcontrol.c | 6 + mm/memory.c | 5 + mm/page-writeback.c | 13 + mm/page_cgroup.c | 23 +- mm/readahead.c | 2 +- mm/swap_state.c | 2 + 60 files changed, 6854 insertions(+), 1147 deletions(-) diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index 961a051..dc8bf95 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -10,6 +10,8 @@ capability.txt - Generic Block Device Capability 
(/sys/block/<disk>/capability) deadline-iosched.txt - Deadline IO scheduler tunables +io-controller.txt + - IO controller for providing hierarchical IO scheduling ioprio.txt - Block io priorities (in CFQ scheduler) request.txt diff --git a/Documentation/block/io-controller.txt b/Documentation/block/io-controller.txt new file mode 100644 index 0000000..21948c3 --- /dev/null +++ b/Documentation/block/io-controller.txt @@ -0,0 +1,407 @@ + IO Controller + ============= + +Overview +======== + +This patchset implements a proportional weight IO controller. That is one +can create cgroups and assign prio/weights to those cgroups and task group +will get access to disk proportionate to the weight of the group. + +These patches modify elevator layer and individual IO schedulers to do +IO control hence this io controller works only on block devices which use +one of the standard io schedulers and can not be used with any xyz logical block +device. + +The assumption/thought behind modifying IO scheduler is that resource control +is primarily needed on leaf nodes where the actual contention for resources is +present and not on intermediate logical block devices. + +Consider following hypothetical scenario. Let's say there are three physical +disks, namely sda, sdb and sdc. Two logical volumes (lv0 and lv1) have been +created on top of these. Some part of sdb is in lv0 and some part is in lv1. + + lv0 lv1 + / \ / \ + sda sdb sdc + +Also consider following cgroup hierarchy + + root + / \ + A B + / \ / \ + T1 T2 T3 T4 + +A and B are two cgroups and T1, T2, T3 and T4 are tasks within those cgroups. +Assuming T1, T2, T3 and T4 are doing IO on lv0 and lv1. These tasks should +get their fair share of bandwidth on disks sda, sdb and sdc. There is no +IO control on intermediate logical block nodes (lv0, lv1). + +So if tasks T1 and T2 are doing IO on lv0 and T3 and T4 are doing IO on lv1 +only, there will not be any contention for resources between group A and B if +IO is going to sda or sdc.
But if actual IO gets translated to disk sdb, then +IO scheduler associated with the sdb will distribute disk bandwidth to +group A and B proportionate to their weight. + +CFQ already has the notion of fairness and it provides differential disk +access based on priority and class of the task. Just that it is flat and +with cgroup stuff, it needs to be made hierarchical to achieve a good +hierarchical control on IO. + +Rest of the IO schedulers (noop, deadline and AS) don't have any notion +of fairness among various threads. They maintain only one queue where all +the IO gets queued (internally this queue is split in read and write queue +for deadline and AS). With this patchset, now we maintain one queue per +cgroup per device and then try to do fair queuing among those queues. + +One of the concerns raised with modifying IO schedulers was that we don't +want to replicate the code in all the IO schedulers. These patches share +the fair queuing code which has been moved to a common layer (elevator +layer). Hence we don't end up replicating code across IO schedulers. Following +diagram depicts the concept. + + -------------------------------- + | Elevator Layer + Fair Queuing | + -------------------------------- + | | | | + NOOP DEADLINE AS CFQ + +Design +====== +This patchset takes the inspiration from CFS cpu scheduler and CFQ to come +up with core of hierarchical scheduling. Like CFQ we give time slices to +every queue based on their priority. Like CFS, this disktime given to a +queue is converted to virtual disk time based on queue's weight (vdisktime) +and based on this vdisktime we decide which is the queue next to be +dispatched. + +From data structure point of view, one can think of a tree per device, where +io groups and io queues are hanging and are being scheduled using B-WF2Q+ +algorithm. io_queue is the end queue where requests are actually stored and +dispatched from (like cfqq).
+ +These io queues are primarily created by and managed by end io schedulers +depending on its semantics. For example, noop, deadline and AS ioschedulers +keep one io queue per cgroup and cfq keeps one io queue per io_context in +a cgroup (apart from async queues). + +A request is mapped to an io group by elevator layer and which io queue it +is mapped to within the group depends on ioscheduler. Currently "current" task +is used to determine the cgroup (hence io group) of the request. Down the +line we need to make use of bio-cgroup patches to map delayed writes to +right group. + +Going back to old behavior +========================== +In new scheme of things essentially we are creating hierarchical fair +queuing logic in elevator layer and changing IO schedulers to make use of +that logic so that end IO schedulers start supporting hierarchical scheduling. + +Elevator layer continues to support the old interfaces. So even if fair queuing +is enabled at elevator layer, one can have both new hierarchical scheduler as +well as old non-hierarchical scheduler operating. + +Also noop, deadline and AS have option of enabling hierarchical scheduling. +If it is selected, fair queuing is done in hierarchical manner. If hierarchical +scheduling is disabled, noop, deadline and AS should retain their existing +behavior. + +CFQ is the only exception where one can not disable fair queuing as it is +needed for providing fairness among various threads even in non-hierarchical +mode. + +Various user visible config options +=================================== +CONFIG_IOSCHED_NOOP_HIER + - Enables hierarchical fair queuing in noop. Not selecting this option + leads to old behavior of noop. + +CONFIG_IOSCHED_DEADLINE_HIER + - Enables hierarchical fair queuing in deadline. Not selecting this + option leads to old behavior of deadline. + +CONFIG_IOSCHED_AS_HIER + - Enables hierarchical fair queuing in AS. Not selecting this option + leads to old behavior of AS.
+ +CONFIG_IOSCHED_CFQ_HIER + - Enables hierarchical fair queuing in CFQ. Not selecting this option + still does fair queuing among various queues but it is flat and not + hierarchical. + +CONFIG_CGROUP_BLKIO + - This option enables blkio-cgroup controller for IO tracking + purposes. That means, by this controller one can attribute a write + to the original cgroup and not assume that it belongs to submitting + thread. + +CONFIG_TRACK_ASYNC_CONTEXT + - Currently CFQ attributes the writes to the submitting thread and + caches the async queue pointer in the io context of the process. + If this option is set, it tells cfq and elevator fair queuing logic + that for async writes make use of IO tracking patches and attribute + writes to original cgroup and not to write submitting thread. + + This should be primarily useful when lots of asynchronous writes + are being submitted by pdflush threads and we need to assign the + writes to right group. + +CONFIG_DEBUG_GROUP_IOSCHED + - Throws extra debug messages in blktrace output helpful in + doing debugging in hierarchical setup. + + - Also allows for export of extra debug statistics like group queue + and dequeue statistics on device through cgroup interface. + +CONFIG_DEBUG_ELV_FAIR_QUEUING + - Enables some vdisktime related debugging messages. + +Config options selected automatically +===================================== +These config options are not user visible and are selected/deselected +automatically based on IO scheduler configurations. + +CONFIG_ELV_FAIR_QUEUING + - Enables/Disables the fair queuing logic at elevator layer. + +CONFIG_GROUP_IOSCHED + - Enables/Disables hierarchical queuing and associated cgroup bits. + +HOWTO +===== +You can do a very simple testing of running two dd threads in two different +cgroups. Here is what you can do. + +- Enable hierarchical scheduling in io scheduler of your choice (say cfq). + CONFIG_IOSCHED_CFQ_HIER=y + +- Enable IO tracking for async writes.
+ CONFIG_TRACK_ASYNC_CONTEXT=y + + (This will automatically select CGROUP_BLKIO) + +- Compile and boot into kernel and mount IO controller and blkio io tracking + controller. + + mount -t cgroup -o io,blkio none /cgroup + +- Create two cgroups + mkdir -p /cgroup/test1/ /cgroup/test2 + +- Set weights of group test1 and test2 + echo 1000 > /cgroup/test1/io.weight + echo 500 > /cgroup/test2/io.weight + +- Set "fairness" parameter to 1 at the disk you are testing. + + echo 1 > /sys/block/<disk>/queue/iosched/fairness + +- Create two same size files (say 512MB each) on same disk (file1, file2) and + launch two dd threads in different cgroup to read those files. Make sure + right io scheduler is being used for the block device where files are + present (the one you compiled in hierarchical mode). + + sync + echo 3 > /proc/sys/vm/drop_caches + + dd if=/mnt/sdb/zerofile1 of=/dev/null & + echo $! > /cgroup/test1/tasks + cat /cgroup/test1/tasks + + dd if=/mnt/sdb/zerofile2 of=/dev/null & + echo $! > /cgroup/test2/tasks + cat /cgroup/test2/tasks + +- At macro level, first dd should finish first. To get more precise data, keep + on looking (with the help of a script) at io.disk_time and io.disk_sectors + files of both test1 and test2 groups. This will tell how much disk time + (in milliseconds), each group got and how many sectors each group + dispatched to the disk. We provide fairness in terms of disk time, so + ideally io.disk_time of cgroups should be in proportion to the weight. + +Some High Level Test setups +=========================== +One of the use cases of IO controller is to provide some kind of IO isolation +between multiple virtual machines on the same host. Following is one +example setup which worked for me.
+ + + KVM KVM + Guest1 Guest2 + --------- ---------- + | ----- | | ------ | + | | vdb | | | | vdb | | + | ----- | | ------ | + --------- ---------- + + --------------------------- + | Host | + | ------------- | + | | sdb1 | sdb2 | | + | ------------- | + --------------------------- + +On host machine, I had a spare SATA disk. I created two partitions sdb1 +and sdb2 and gave this partitions as additional storage to kvm guests. sdb1 +to KVM guest1 and sdb2 KVM guest2. These storage appeared as /dev/vdb in +both the guests. Formatted the /dev/vdb and created ext3 file system and +started a 1G file writeout in both the guests. Before writeout I had created +two cgroups of weight 1000 and 500 and put virtual machines in two different +groups. + +Following is write I started in both the guests. + +dd if=/dev/zero of=/mnt/vdc/zerofile1 bs=4K count=262144 conv=fdatasync + +Following are the results on host with "deadline" scheduler. + +group1 time=8:16 17254 group1 sectors=8:16 2104288 +group2 time=8:16 8498 group2 sectors=8:16 1007040 + +Virtual machine with cgroup weight 1000 got almost double the time of virtual +machine with weight 500. + +What Works and What Does not +============================ +Service differentiation at application level can be noticed only if completely +parallel IO paths are created from application to IO scheduler and there +are no serializations introduced by any intermediate layer. For example, +in some cases file system and page cache layer introduce serialization and +we don't see service difference between higher weight and lower weight +process groups. + +For example, when I start an O_SYNC write out on an ext3 file system (file +is being created newly), I see lots of activity from kjournald. I have not +gone into details yet, but my understanding is that there are lot more +journal commits and kjournald kind of introduces serialization between two +processes. 
So even if you put these two processes in two different cgroups +with different weights, higher weight process will not see more IO done. + +It does work very well when we bypass filesystem layer and IO is raw. For +example in above virtual machine case, host sees raw synchronous writes +coming from two guest machines and filesystem layer at host is not introducing +any kind of serialization hence we can see the service difference. + +It also works very well for reads even on the same file system as for reads +file system journalling activity does not kick in and we can create parallel +IO paths from application all the way down to IO scheduler and get more +IO done on the IO path with higher weight. + +Regarding "fairness" parameter +============================== +IO controller has introduced a "fairness" tunable for every io scheduler. +Currently this tunable can assume values 0, 1. + +If fairness is set to 1, then IO controller waits for requests to finish from +previous queue before requests from new queue are dispatched. This helps in +doing better accounting of disk time consumed by a queue. If this is not done +then on a queuing hardware, there can be requests from multiple queues and +we will not have any idea which queue consumed how much of disk time. + +Details of cgroup files +======================= +- io.ioprio_class + - Specifies class of the cgroup (RT, BE, IDLE). This is default io + class of the group on all the devices until and unless overridden by + per device rule. (See io.policy). + + 1 = RT, 2 = BE, 3 = IDLE + +- io.weight + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See io.policy). + + Currently allowed range of weights is from 100 to 1000. + +- io.disk_time + - disk time allocated to cgroup per device in milliseconds.
First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- io.disk_sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- io.disk_queue + - Debugging aid only enabled if CONFIG_DEBUG_GROUP_IOSCHED=y. This + gives the statistics about how many times a group was queued + on service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was queued on a particular device. + +- io.disk_dequeue + - Debugging aid only enabled if CONFIG_DEBUG_GROUP_IOSCHED=y. This + gives the statistics about how many times a group was de-queued + or removed from the service tree of the device. This basically gives + an idea if we can generate enough IO to create continuously + backlogged groups. First two fields specify the major and minor + number of the device and third field specifies the number + of times a group was de-queued on a particular device. + +- io.policy + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight and class as + specified by io.weight and io.ioprio_class. + + Following is the format. + + #echo dev_maj:dev_minor weight ioprio_class > /path/to/cgroup/io.policy + + weight=0 means removing a policy.
+ + Examples: + + Configure weight=300 ioprio_class=2 on /dev/sdb (8:16) in this cgroup + # echo 8:16 300 2 > io.policy + # cat io.policy + dev weight class + 8:16 300 2 + + Configure weight=500 ioprio_class=1 on /dev/sda (8:0) in this cgroup + # echo 8:0 500 1 > io.policy + # cat io.policy + dev weight class + 8:0 500 1 + 8:16 300 2 + + Remove the policy for /dev/sda in this cgroup + # echo 8:0 0 1 > io.policy + # cat io.policy + dev weight class + 8:16 300 2 + +About configuring request descriptors +===================================== +Traditionally there are 128 request descriptors allocated per request queue +where io scheduler is operating (/sys/block/<disk>/queue/nr_requests). If these +request descriptors are exhausted, processes will be put to sleep and woken +up once request descriptors are available. + +With io controller and cgroup stuff, one can not afford to allocate requests +from single pool as one group might allocate lots of requests and then tasks +from other groups might be put to sleep and this other group might be a +higher weight group. Hence to make sure that a group always can get the +request descriptors it is entitled to, one needs to make request descriptor +limit per group on every queue. + +A new parameter /sys/block/<disk>/queue/nr_group_requests has been introduced +and this parameter controls the maximum number of requests per group. +nr_requests still continues to control total number of request descriptors +on the queue. + +Ideally one should set nr_requests to be following. + +nr_requests = number_of_cgroups * nr_group_requests + +This will make sure that at any point of time nr_group_requests number of +request descriptors will be available for any of the cgroups. + +Currently default nr_requests=512 and nr_group_requests=128. This will make +sure that apart from root group one can create 3 more groups without running +into any issues. If one decides to create more cgroups, nr_requests and +nr_group_requests should be adjusted accordingly.
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 7e803fc..edcd317 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -2,6 +2,28 @@ if BLOCK menu "IO Schedulers" +config ELV_FAIR_QUEUING + bool + default n + ---help--- + Traditionally only cfq had notion of multiple queues and it did + fair queuing on its own. With the cgroups and need of controlling + IO, now even the simple io schedulers like noop, deadline, as will + have one queue per cgroup and will need hierarchical fair queuing. + Instead of every io scheduler implementing its own fair queuing + logic, this option enables fair queuing in elevator layer so that + other ioschedulers can make use of it. + If unsure, say N. + +config DEBUG_ELV_FAIR_QUEUING + bool "Debug elevator fair queuing" + depends on ELV_FAIR_QUEUING + default n + ---help--- + Enable some debugging hooks for elevator fair queuing support. + Currently it just outputs more information about vdisktime in + blktrace output. + config IOSCHED_NOOP bool default y @@ -12,6 +34,17 @@ config IOSCHED_NOOP that do their own scheduling and require only minimal assistance from the kernel. +config IOSCHED_NOOP_HIER + bool "Noop Hierarchical Scheduling support" + depends on IOSCHED_NOOP && CGROUPS + select ELV_FAIR_QUEUING + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in noop. In this mode noop keeps + one IO queue per cgroup instead of a global queue. Elevator + fair queuing logic ensures fairness among various queues. + config IOSCHED_AS tristate "Anticipatory I/O scheduler" default y @@ -21,6 +54,18 @@ config IOSCHED_AS deadline I/O scheduler, it can also be slower in some cases especially some database loads. +config IOSCHED_AS_HIER + bool "Anticipatory Hierarchical Scheduling support" + depends on IOSCHED_AS && CGROUPS + select ELV_FAIR_QUEUING + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in anticipatory.
In this mode + anticipatory keeps one IO queue per cgroup instead of a global + queue. Elevator fair queuing logic ensures fairness among various + queues. + config IOSCHED_DEADLINE tristate "Deadline I/O scheduler" default y @@ -31,8 +76,20 @@ config IOSCHED_DEADLINE a disk at any one time, its behaviour is almost identical to the anticipatory I/O scheduler and so is a good choice. +config IOSCHED_DEADLINE_HIER + bool "Deadline Hierarchical Scheduling support" + depends on IOSCHED_DEADLINE && CGROUPS + select ELV_FAIR_QUEUING + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in deadline. In this mode deadline keeps + one IO queue per cgroup instead of a global queue. Elevator + fair queuing logic ensures fairness among various queues. + config IOSCHED_CFQ tristate "CFQ I/O scheduler" + select ELV_FAIR_QUEUING default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -40,6 +97,14 @@ config IOSCHED_CFQ working environment, suitable for desktop systems. This is the default I/O scheduler. +config IOSCHED_CFQ_HIER + bool "CFQ Hierarchical Scheduling support" + depends on IOSCHED_CFQ && CGROUPS + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in cfq. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -68,6 +133,30 @@ config DEFAULT_IOSCHED default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP +config DEBUG_GROUP_IOSCHED + bool "Debug Hierarchical Scheduling support" + depends on CGROUPS && GROUP_IOSCHED + default n + ---help--- + Enable some debugging hooks for hierarchical scheduling support. + Currently it just outputs more information in blktrace output. + +config TRACK_ASYNC_CONTEXT + bool "Determine async request context from bio" + depends on GROUP_IOSCHED + select CGROUP_BLKIO + default n + ---help--- + Normally async request is attributed to the task submitting the + request.
With group ioscheduling, for accurate accounting of + async writes, one needs to map the request to original task/cgroup + which originated the request and not the submitter of the request. + + Currently there are generic io tracking patches to provide facility + to map bio to original owner. If this option is set, for async + request, original owner of the bio is decided by using io tracking + patches otherwise we continue to attribute the request to the + submitting thread. endmenu endif diff --git a/block/Makefile b/block/Makefile index 6c54ed0..d545323 100644 --- a/block/Makefile +++ b/block/Makefile @@ -15,3 +15,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_ELV_FAIR_QUEUING) += elevator-fq.o diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6..8ea9398 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include "elevator-fq.h" /* * See Documentation/block/as-iosched.txt @@ -76,13 +78,8 @@ enum anticipation_status { * or timed out */ }; -struct as_data { - /* - * run time data - */ - - struct request_queue *q; /* the "owner" queue */ - +struct as_queue { + struct io_queue *ioq; /* * requests (as_rq s) are present on both sort_list and fifo_list */ @@ -90,6 +87,28 @@ struct as_data { struct list_head fifo_list[2]; struct request *next_rq[2]; /* next in sort order */ + + /* + * If an as_queue is switched while a batch is running, then we + * store the time left before current batch will expire + */ + long current_batch_time_left; + + /* + * batch data dir when queue was scheduled out. This will be used + * to setup ad->batch_data_dir when queue is scheduled in. 
+ */ + int saved_batch_data_dir; + + unsigned long last_check_fifo[2]; + int write_batch_count; /* max # of reqs in a write batch */ + int current_write_count; /* how many requests left this batch */ + int write_batch_idled; /* has the write batch gone idle? */ + int nr_queued[2]; +}; + +struct as_data { + struct request_queue *q; /* the "owner" queue */ sector_t last_sector[2]; /* last SYNC & ASYNC sectors */ unsigned long exit_prob; /* probability a task will exit while @@ -103,21 +122,17 @@ struct as_data { sector_t new_seek_mean; unsigned long current_batch_expires; - unsigned long last_check_fifo[2]; int changed_batch; /* 1: waiting for old batch to end */ int new_batch; /* 1: waiting on first read complete */ - int batch_data_dir; /* current batch SYNC / ASYNC */ - int write_batch_count; /* max # of reqs in a write batch */ - int current_write_count; /* how many requests left this batch */ - int write_batch_idled; /* has the write batch gone idle? */ enum anticipation_status antic_status; unsigned long antic_start; /* jiffies: when it started */ struct timer_list antic_timer; /* anticipatory scheduling timer */ - struct work_struct antic_work; /* Deferred unplugging */ + struct work_struct antic_work; /* Deferred unplugging */ struct io_context *io_context; /* Identify the expected process */ int ioc_finished; /* IO associated with io_context is finished */ int nr_dispatched; + int batch_data_dir; /* current batch SYNC / ASYNC */ /* * settings that change how the i/o scheduler behaves @@ -125,6 +140,9 @@ struct as_data { unsigned long fifo_expire[2]; unsigned long batch_expire[2]; unsigned long antic_expire; + + /* elevator requested a queue switch. */ + int switch_queue; }; /* @@ -146,12 +164,185 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +#define as_log_asq(ad, asq, fmt, args...) 
\ +{ \ + blk_add_trace_msg((ad)->q, "as %s " fmt, \ + ioq_to_io_group((asq)->ioq)->path, ##args); \ +} +#else +#define as_log_asq(ad, asq, fmt, args...) \ + blk_add_trace_msg((ad)->q, "as " fmt, ##args) +#endif + +#define as_log(ad, fmt, args...) \ + blk_add_trace_msg((ad)->q, "as " fmt, ##args) + static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); static void as_move_to_dispatch(struct as_data *ad, struct request *rq); static void as_antic_stop(struct as_data *ad); +static inline int as_batch_expired(struct as_data *ad, struct as_queue *asq); + +#ifdef CONFIG_IOSCHED_AS_HIER +static void as_save_batch_context(struct as_data *ad, struct as_queue *asq) +{ + /* Save batch data dir */ + asq->saved_batch_data_dir = ad->batch_data_dir; + + if (ad->changed_batch) { + /* + * In case of force expire, we come here. Batch changeover + * has been signalled but we are waiting for all the + * request to finish from previous batch and then start + * the new batch. Can't wait now. Mark that full batch time + * needs to be allocated when this queue is scheduled again. + */ + asq->current_batch_time_left = + ad->batch_expire[ad->batch_data_dir]; + ad->changed_batch = 0; + goto out; + } + + if (ad->new_batch) { + /* + * We should come here only when new_batch has been set + * but no read request has been issued or if it is a forced + * expiry. + * + * In both the cases, new batch has not started yet so + * allocate full batch length for next scheduling opportunity. + * We don't do write batch size adjustment in hierarchical + * AS so that should not be an issue. 
+ */ + asq->current_batch_time_left = + ad->batch_expire[ad->batch_data_dir]; + ad->new_batch = 0; + goto out; + } + + /* Save how much time is left before current batch expires */ + if (as_batch_expired(ad, asq)) + asq->current_batch_time_left = 0; + else { + asq->current_batch_time_left = ad->current_batch_expires + - jiffies; + BUG_ON((asq->current_batch_time_left) < 0); + } + + if (ad->io_context) { + put_io_context(ad->io_context); + ad->io_context = NULL; + } + +out: + as_log_asq(ad, asq, "save batch: dir=%c time_left=%d changed_batch=%d" + " new_batch=%d, antic_status=%d", + ad->batch_data_dir ? 'R' : 'W', + asq->current_batch_time_left, + ad->changed_batch, ad->new_batch, ad->antic_status); + return; +} + +/* + * FIXME: In original AS, read batch's time account started only after when + * first request had completed (if last batch was a write batch). But here + * we might be rescheduling a read batch right away irrespective of the fact + * of disk cache state. + */ +static void as_restore_batch_context(struct as_data *ad, struct as_queue *asq) +{ + /* Adjust the batch expire time */ + if (asq->current_batch_time_left) + ad->current_batch_expires = jiffies + + asq->current_batch_time_left; + /* restore asq batch_data_dir info */ + ad->batch_data_dir = asq->saved_batch_data_dir; + as_log_asq(ad, asq, "restore batch: dir=%c time=%d reads_q=%d" + " writes_q=%d ad->antic_status=%d", + ad->batch_data_dir ? 'R' : 'W', + asq->current_batch_time_left, + asq->nr_queued[1], asq->nr_queued[0], + ad->antic_status); +} + +/* ioq has been set. */ +static void as_active_ioq_set(struct request_queue *q, void *sched_queue, + int coop) +{ + struct as_queue *asq = sched_queue; + struct as_data *ad = q->elevator->elevator_data; + + as_restore_batch_context(ad, asq); +} + +/* + * This is a notification from common layer that it wishes to expire this + * io queue. AS decides whether queue can be expired, if yes, it also + * saves the batch context. 
+ */ +static int as_expire_ioq(struct request_queue *q, void *sched_queue, + int slice_expired, int force) +{ + struct as_data *ad = q->elevator->elevator_data; + int status = ad->antic_status; + struct as_queue *asq = sched_queue; + + as_log_asq(ad, asq, "as_expire_ioq slice_expired=%d, force=%d", + slice_expired, force); + + /* Forced expiry. We don't have a choice */ + if (force) { + as_antic_stop(ad); + /* + * antic_stop() sets antic_status to FINISHED which signifies + * that either we timed out or we found a close request but + * that's not the case here. Start from scratch. + */ + ad->antic_status = ANTIC_OFF; + as_save_batch_context(ad, asq); + ad->switch_queue = 0; + return 1; + } + + /* + * We are waiting for requests to finish from last + * batch. Don't expire the queue now + */ + if (ad->changed_batch) + goto keep_queue; + + /* + * Wait for all requests from existing batch to finish before we + * switch the queue. New queue might change the batch direction + * and this is to be consistent with AS philosophy of not dispatching + * new requests to underlying drive till requests from requests + * from previous batch are completed. + */ + if (ad->nr_dispatched) + goto keep_queue; + + /* + * If AS anticipation is ON, wait for it to finish. + */ + BUG_ON(status == ANTIC_WAIT_REQ); + + if (status == ANTIC_WAIT_NEXT) + goto keep_queue; + + /* We are good to expire the queue. 
Save batch context */ + as_save_batch_context(ad, asq); + ad->switch_queue = 0; + return 1; + +keep_queue: + /* Mark that elevator requested for queue switch whenever possible */ + ad->switch_queue = 1; + return 0; +} +#endif /* * IO Context helper functions @@ -258,13 +449,14 @@ static void as_put_io_context(struct request *rq) /* * rb tree support functions */ -#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) +#define RQ_RB_ROOT(asq, rq) (&(asq)->sort_list[rq_is_sync((rq))]) static void as_add_rq_rb(struct as_data *ad, struct request *rq) { struct request *alias; + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); - while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { + while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(asq, rq), rq)))) { as_move_to_dispatch(ad, alias); as_antic_stop(ad); } @@ -272,7 +464,9 @@ static void as_add_rq_rb(struct as_data *ad, struct request *rq) static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) { - elv_rb_del(RQ_RB_ROOT(ad, rq), rq); + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); + + elv_rb_del(RQ_RB_ROOT(asq, rq), rq); } /* @@ -366,7 +560,7 @@ as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) * what request to process next. Anticipation works on top of this. 
*/ static struct request * -as_find_next_rq(struct as_data *ad, struct request *last) +as_find_next_rq(struct as_data *ad, struct as_queue *asq, struct request *last) { struct rb_node *rbnext = rb_next(&last->rb_node); struct rb_node *rbprev = rb_prev(&last->rb_node); @@ -382,7 +576,7 @@ as_find_next_rq(struct as_data *ad, struct request *last) else { const int data_dir = rq_is_sync(last); - rbnext = rb_first(&ad->sort_list[data_dir]); + rbnext = rb_first(&asq->sort_list[data_dir]); if (rbnext && rbnext != &last->rb_node) next = rb_entry_rq(rbnext); } @@ -428,6 +622,7 @@ static void as_antic_waitnext(struct as_data *ad) mod_timer(&ad->antic_timer, timeout); ad->antic_status = ANTIC_WAIT_NEXT; + as_log(ad, "antic_waitnext set"); } /* @@ -441,8 +636,10 @@ static void as_antic_waitreq(struct as_data *ad) if (ad->antic_status == ANTIC_OFF) { if (!ad->io_context || ad->ioc_finished) as_antic_waitnext(ad); - else + else { ad->antic_status = ANTIC_WAIT_REQ; + as_log(ad, "antic_waitreq set"); + } } } @@ -454,6 +651,8 @@ static void as_antic_stop(struct as_data *ad) { int status = ad->antic_status; + as_log(ad, "as_antic_stop antic_status=%d", ad->antic_status); + if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) { if (status == ANTIC_WAIT_NEXT) del_timer(&ad->antic_timer); @@ -473,6 +672,7 @@ static void as_antic_timeout(unsigned long data) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); + as_log(ad, "as_antic_timeout"); if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { struct as_io_context *aic; @@ -651,6 +851,21 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq) struct io_context *ioc; struct as_io_context *aic; +#ifdef CONFIG_IOSCHED_AS_HIER + /* + * If the active asq and rq's asq are not same, then one can not + * break the anticipation. This primarily becomes useful when a + * request is added to a queue which is not being served currently. 
+ */ + if (rq) { + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); + struct as_queue *curr_asq = + elv_active_sched_queue(ad->q->elevator); + + if (asq != curr_asq) + return 0; + } +#endif ioc = ad->io_context; BUG_ON(!ioc); spin_lock(&ioc->lock); @@ -789,9 +1004,10 @@ static int as_can_anticipate(struct as_data *ad, struct request *rq) static void as_update_rq(struct as_data *ad, struct request *rq) { const int data_dir = rq_is_sync(rq); + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); /* keep the next_rq cache up to date */ - ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]); + asq->next_rq[data_dir] = as_choose_req(ad, rq, asq->next_rq[data_dir]); /* * have we been anticipating this request? @@ -808,29 +1024,37 @@ static void as_update_rq(struct as_data *ad, struct request *rq) /* * Gathers timings and resizes the write batch automatically */ -static void update_write_batch(struct as_data *ad) +static void update_write_batch(struct as_data *ad, struct request *rq) { unsigned long batch = ad->batch_expire[BLK_RW_ASYNC]; long write_time; + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); write_time = (jiffies - ad->current_batch_expires) + batch; if (write_time < 0) write_time = 0; - if (write_time > batch && !ad->write_batch_idled) { + as_log_asq(ad, asq, "upd write: write_time=%d batch=%d" + " write_batch_idled=%d current_write_count=%d", + write_time, batch, asq->write_batch_idled, + asq->current_write_count); + + if (write_time > batch && !asq->write_batch_idled) { if (write_time > batch * 3) - ad->write_batch_count /= 2; + asq->write_batch_count /= 2; else - ad->write_batch_count--; - } else if (write_time < batch && ad->current_write_count == 0) { + asq->write_batch_count--; + } else if (write_time < batch && asq->current_write_count == 0) { if (batch > write_time * 3) - ad->write_batch_count *= 2; + asq->write_batch_count *= 2; else - ad->write_batch_count++; + asq->write_batch_count++; } - if (ad->write_batch_count < 
1) - ad->write_batch_count = 1; + if (asq->write_batch_count < 1) + asq->write_batch_count = 1; + + as_log_asq(ad, asq, "upd write count=%d", asq->write_batch_count); } /* @@ -840,6 +1064,7 @@ static void update_write_batch(struct as_data *ad) static void as_completed_request(struct request_queue *q, struct request *rq) { struct as_data *ad = q->elevator->elevator_data; + struct as_queue *asq = elv_get_sched_queue(q, rq); WARN_ON(!list_empty(&rq->queuelist)); @@ -848,7 +1073,24 @@ static void as_completed_request(struct request_queue *q, struct request *rq) goto out; } + as_log_asq(ad, asq, "complete: reads_q=%d writes_q=%d changed_batch=%d" + " new_batch=%d switch_queue=%d, dir=%c", + asq->nr_queued[1], asq->nr_queued[0], ad->changed_batch, + ad->new_batch, ad->switch_queue, + ad->batch_data_dir ? 'R' : 'W'); + if (ad->changed_batch && ad->nr_dispatched == 1) { + /* + * If this was write batch finishing, adjust the write batch + * length. + * + * Note, write batch length is being calculated upon completion + * of last write request finished and not completion of first + * read request finished in the next batch. + */ + if (ad->batch_data_dir == BLK_RW_SYNC) + update_write_batch(ad, rq); + ad->current_batch_expires = jiffies + ad->batch_expire[ad->batch_data_dir]; kblockd_schedule_work(q, &ad->antic_work); @@ -866,7 +1108,6 @@ static void as_completed_request(struct request_queue *q, struct request *rq) * and writeback caches */ if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { - update_write_batch(ad); ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; ad->new_batch = 0; @@ -885,6 +1126,13 @@ static void as_completed_request(struct request_queue *q, struct request *rq) } as_put_io_context(rq); + + /* + * If elevator requested a queue switch, kick the queue in the + * hope that this is right time for switch. 
+ */ + if (ad->switch_queue) + kblockd_schedule_work(q, &ad->antic_work); out: RQ_SET_STATE(rq, AS_RQ_POSTSCHED); } @@ -901,9 +1149,13 @@ static void as_remove_queued_request(struct request_queue *q, const int data_dir = rq_is_sync(rq); struct as_data *ad = q->elevator->elevator_data; struct io_context *ioc; + struct as_queue *asq = elv_get_sched_queue(q, rq); WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); + BUG_ON(asq->nr_queued[data_dir] <= 0); + asq->nr_queued[data_dir]--; + ioc = RQ_IOC(rq); if (ioc && ioc->aic) { BUG_ON(!atomic_read(&ioc->aic->nr_queued)); @@ -914,8 +1166,8 @@ static void as_remove_queued_request(struct request_queue *q, * Update the "next_rq" cache if we are about to remove its * entry */ - if (ad->next_rq[data_dir] == rq) - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); + if (asq->next_rq[data_dir] == rq) + asq->next_rq[data_dir] = as_find_next_rq(ad, asq, rq); rq_fifo_clear(rq); as_del_rq_rb(ad, rq); @@ -929,23 +1181,23 @@ static void as_remove_queued_request(struct request_queue *q, * * See as_antic_expired comment. */ -static int as_fifo_expired(struct as_data *ad, int adir) +static int as_fifo_expired(struct as_data *ad, struct as_queue *asq, int adir) { struct request *rq; long delta_jif; - delta_jif = jiffies - ad->last_check_fifo[adir]; + delta_jif = jiffies - asq->last_check_fifo[adir]; if (unlikely(delta_jif < 0)) delta_jif = -delta_jif; if (delta_jif < ad->fifo_expire[adir]) return 0; - ad->last_check_fifo[adir] = jiffies; + asq->last_check_fifo[adir] = jiffies; - if (list_empty(&ad->fifo_list[adir])) + if (list_empty(&asq->fifo_list[adir])) return 0; - rq = rq_entry_fifo(ad->fifo_list[adir].next); + rq = rq_entry_fifo(asq->fifo_list[adir].next); return time_after(jiffies, rq_fifo_time(rq)); } @@ -954,7 +1206,7 @@ static int as_fifo_expired(struct as_data *ad, int adir) * as_batch_expired returns true if the current batch has expired. A batch * is a set of reads or a set of writes. 
*/ -static inline int as_batch_expired(struct as_data *ad) +static inline int as_batch_expired(struct as_data *ad, struct as_queue *asq) { if (ad->changed_batch || ad->new_batch) return 0; @@ -964,7 +1216,7 @@ static inline int as_batch_expired(struct as_data *ad) return time_after(jiffies, ad->current_batch_expires); return time_after(jiffies, ad->current_batch_expires) - || ad->current_write_count == 0; + || asq->current_write_count == 0; } /* @@ -973,6 +1225,7 @@ static inline int as_batch_expired(struct as_data *ad) static void as_move_to_dispatch(struct as_data *ad, struct request *rq) { const int data_dir = rq_is_sync(rq); + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); @@ -995,12 +1248,12 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq) ad->io_context = NULL; } - if (ad->current_write_count != 0) - ad->current_write_count--; + if (asq->current_write_count != 0) + asq->current_write_count--; } ad->ioc_finished = 0; - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); + asq->next_rq[data_dir] = as_find_next_rq(ad, asq, rq); /* * take it off the sort and fifo list, add to dispatch queue @@ -1014,6 +1267,8 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq) if (RQ_IOC(rq) && RQ_IOC(rq)->aic) atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); ad->nr_dispatched++; + as_log_asq(ad, asq, "dispatch req dir=%c nr_dispatched = %d", + data_dir ? 
'R' : 'W', ad->nr_dispatched); } /* @@ -1024,9 +1279,16 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq) static int as_dispatch_request(struct request_queue *q, int force) { struct as_data *ad = q->elevator->elevator_data; - const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]); - const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]); struct request *rq; + struct as_queue *asq = elv_select_sched_queue(q, force); + int reads, writes; + + if (!asq) + return 0; + + reads = !list_empty(&asq->fifo_list[BLK_RW_SYNC]); + writes = !list_empty(&asq->fifo_list[BLK_RW_ASYNC]); + if (unlikely(force)) { /* @@ -1042,44 +1304,52 @@ static int as_dispatch_request(struct request_queue *q, int force) ad->changed_batch = 0; ad->new_batch = 0; - while (ad->next_rq[BLK_RW_SYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]); + while (asq->next_rq[BLK_RW_SYNC]) { + as_move_to_dispatch(ad, asq->next_rq[BLK_RW_SYNC]); dispatched++; } - ad->last_check_fifo[BLK_RW_SYNC] = jiffies; + asq->last_check_fifo[BLK_RW_SYNC] = jiffies; - while (ad->next_rq[BLK_RW_ASYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]); + while (asq->next_rq[BLK_RW_ASYNC]) { + as_move_to_dispatch(ad, asq->next_rq[BLK_RW_ASYNC]); dispatched++; } - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; + asq->last_check_fifo[BLK_RW_ASYNC] = jiffies; + as_log_asq(ad, asq, "forced dispatch"); return dispatched; } /* Signal that the write batch was uncontended, so we can't time it */ if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) { - if (ad->current_write_count == 0 || !writes) - ad->write_batch_idled = 1; + if (asq->current_write_count == 0 || !writes) + asq->write_batch_idled = 1; } if (!(reads || writes) || ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT - || ad->changed_batch) + || ad->changed_batch) { + as_log_asq(ad, asq, "no dispatch. 
read_q=%d, writes_q=%d" + " ad->antic_status=%d, changed_batch=%d," + " switch_queue=%d new_batch=%d", asq->nr_queued[1], + asq->nr_queued[0], ad->antic_status, ad->changed_batch, + ad->switch_queue, ad->new_batch); return 0; + } - if (!(reads && writes && as_batch_expired(ad))) { + if (!(reads && writes && as_batch_expired(ad, asq))) { /* * batch is still running or no reads or no writes */ - rq = ad->next_rq[ad->batch_data_dir]; + rq = asq->next_rq[ad->batch_data_dir]; if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) { - if (as_fifo_expired(ad, BLK_RW_SYNC)) + if (as_fifo_expired(ad, asq, BLK_RW_SYNC)) goto fifo_expired; if (as_can_anticipate(ad, rq)) { + as_log_asq(ad, asq, "can_anticipate = 1"); as_antic_waitreq(ad); return 0; } @@ -1099,8 +1369,10 @@ static int as_dispatch_request(struct request_queue *q, int force) * data direction (read / write) */ + as_log_asq(ad, asq, "select a fresh batch and request"); + if (reads) { - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC])); + BUG_ON(RB_EMPTY_ROOT(&asq->sort_list[BLK_RW_SYNC])); if (writes && ad->batch_data_dir == BLK_RW_SYNC) /* @@ -1113,8 +1385,9 @@ static int as_dispatch_request(struct request_queue *q, int force) ad->changed_batch = 1; } ad->batch_data_dir = BLK_RW_SYNC; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next); - ad->last_check_fifo[ad->batch_data_dir] = jiffies; + as_log_asq(ad, asq, "new batch dir is sync"); + rq = rq_entry_fifo(asq->fifo_list[BLK_RW_SYNC].next); + asq->last_check_fifo[ad->batch_data_dir] = jiffies; goto dispatch_request; } @@ -1124,7 +1397,7 @@ static int as_dispatch_request(struct request_queue *q, int force) if (writes) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC])); + BUG_ON(RB_EMPTY_ROOT(&asq->sort_list[BLK_RW_ASYNC])); if (ad->batch_data_dir == BLK_RW_SYNC) { ad->changed_batch = 1; @@ -1137,10 +1410,11 @@ dispatch_writes: ad->new_batch = 0; } ad->batch_data_dir = BLK_RW_ASYNC; - ad->current_write_count = ad->write_batch_count; - 
ad->write_batch_idled = 0; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next); - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; + as_log_asq(ad, asq, "new batch dir is async"); + asq->current_write_count = asq->write_batch_count; + asq->write_batch_idled = 0; + rq = rq_entry_fifo(asq->fifo_list[BLK_RW_ASYNC].next); + asq->last_check_fifo[BLK_RW_ASYNC] = jiffies; goto dispatch_request; } @@ -1152,9 +1426,9 @@ dispatch_request: * If a request has expired, service it. */ - if (as_fifo_expired(ad, ad->batch_data_dir)) { + if (as_fifo_expired(ad, asq, ad->batch_data_dir)) { fifo_expired: - rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); + rq = rq_entry_fifo(asq->fifo_list[ad->batch_data_dir].next); } if (ad->changed_batch) { @@ -1172,6 +1446,9 @@ fifo_expired: ad->changed_batch = 0; } + if (ad->switch_queue) + return 0; + /* * rq is the selected appropriate request. */ @@ -1187,6 +1464,7 @@ static void as_add_request(struct request_queue *q, struct request *rq) { struct as_data *ad = q->elevator->elevator_data; int data_dir; + struct as_queue *asq = elv_get_sched_queue(q, rq); RQ_SET_STATE(rq, AS_RQ_NEW); @@ -1194,6 +1472,11 @@ static void as_add_request(struct request_queue *q, struct request *rq) rq->elevator_private = as_get_io_context(q->node); + asq->nr_queued[data_dir]++; + as_log_asq(ad, asq, "add a %c request read_q=%d write_q=%d", + data_dir ? 
'R' : 'W', asq->nr_queued[1], + asq->nr_queued[0]); + if (RQ_IOC(rq)) { as_update_iohist(ad, RQ_IOC(rq)->aic, rq); atomic_inc(&RQ_IOC(rq)->aic->nr_queued); @@ -1205,7 +1488,7 @@ static void as_add_request(struct request_queue *q, struct request *rq) * set expire time and add to fifo list */ rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); - list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); + list_add_tail(&rq->queuelist, &asq->fifo_list[data_dir]); as_update_rq(ad, rq); /* keep state machine up to date */ RQ_SET_STATE(rq, AS_RQ_QUEUED); @@ -1227,31 +1510,20 @@ static void as_deactivate_request(struct request_queue *q, struct request *rq) atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); } -/* - * as_queue_empty tells us if there are requests left in the device. It may - * not be the case that a driver can get the next request even if the queue - * is not empty - it is used in the block layer to check for plugging and - * merging opportunities - */ -static int as_queue_empty(struct request_queue *q) -{ - struct as_data *ad = q->elevator->elevator_data; - - return list_empty(&ad->fifo_list[BLK_RW_ASYNC]) - && list_empty(&ad->fifo_list[BLK_RW_SYNC]); -} - static int as_merge(struct request_queue *q, struct request **req, struct bio *bio) { - struct as_data *ad = q->elevator->elevator_data; sector_t rb_key = bio->bi_sector + bio_sectors(bio); struct request *__rq; + struct as_queue *asq = elv_get_sched_queue_bio(q, bio); + + if (!asq) + return ELEVATOR_NO_MERGE; /* * check for front merge */ - __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key); + __rq = elv_rb_find(&asq->sort_list[bio_data_dir(bio)], rb_key); if (__rq && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; @@ -1334,6 +1606,42 @@ static int as_may_queue(struct request_queue *q, int rw) return ret; } +/* Called with queue lock held */ +static void *as_alloc_as_queue(struct request_queue *q, + struct elevator_queue *eq, gfp_t gfp_mask, struct io_queue *ioq) +{ + 
struct as_queue *asq; + struct as_data *ad = eq->elevator_data; + + asq = kmalloc_node(sizeof(*asq), gfp_mask | __GFP_ZERO, q->node); + if (asq == NULL) + goto out; + + INIT_LIST_HEAD(&asq->fifo_list[BLK_RW_SYNC]); + INIT_LIST_HEAD(&asq->fifo_list[BLK_RW_ASYNC]); + asq->sort_list[BLK_RW_SYNC] = RB_ROOT; + asq->sort_list[BLK_RW_ASYNC] = RB_ROOT; + if (ad) + asq->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; + else + asq->write_batch_count = default_write_batch_expire / 10; + + if (asq->write_batch_count < 2) + asq->write_batch_count = 2; + asq->ioq = ioq; +out: + return asq; +} + +static void as_free_as_queue(struct elevator_queue *e, void *sched_queue) +{ + struct as_queue *asq = sched_queue; + + BUG_ON(!list_empty(&asq->fifo_list[BLK_RW_SYNC])); + BUG_ON(!list_empty(&asq->fifo_list[BLK_RW_ASYNC])); + kfree(asq); +} + static void as_exit_queue(struct elevator_queue *e) { struct as_data *ad = e->elevator_data; @@ -1341,9 +1649,6 @@ static void as_exit_queue(struct elevator_queue *e) del_timer_sync(&ad->antic_timer); cancel_work_sync(&ad->antic_work); - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC])); - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC])); - put_io_context(ad->io_context); kfree(ad); } @@ -1351,7 +1656,7 @@ static void as_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (as_data). 
*/ -static void *as_init_queue(struct request_queue *q) +static void *as_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct as_data *ad; @@ -1367,10 +1672,6 @@ static void *as_init_queue(struct request_queue *q) init_timer(&ad->antic_timer); INIT_WORK(&ad->antic_work, as_work_handler); - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]); - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]); - ad->sort_list[BLK_RW_SYNC] = RB_ROOT; - ad->sort_list[BLK_RW_ASYNC] = RB_ROOT; ad->fifo_expire[BLK_RW_SYNC] = default_read_expire; ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire; ad->antic_expire = default_antic_expire; @@ -1378,9 +1679,7 @@ static void *as_init_queue(struct request_queue *q) ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire; ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; - ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; - if (ad->write_batch_count < 2) - ad->write_batch_count = 2; + ad->switch_queue = 0; return ad; } @@ -1466,6 +1765,11 @@ static struct elv_fs_entry as_attrs[] = { AS_ATTR(antic_expire), AS_ATTR(read_batch_expire), AS_ATTR(write_batch_expire), +#ifdef CONFIG_IOSCHED_AS_HIER + ELV_ATTR(fairness), + ELV_ATTR(slice_sync), + ELV_ATTR(group_idle), +#endif __ATTR_NULL }; @@ -1478,7 +1782,6 @@ static struct elevator_type iosched_as = { .elevator_add_req_fn = as_add_request, .elevator_activate_req_fn = as_activate_request, .elevator_deactivate_req_fn = as_deactivate_request, - .elevator_queue_empty_fn = as_queue_empty, .elevator_completed_req_fn = as_completed_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, @@ -1486,8 +1789,16 @@ static struct elevator_type iosched_as = { .elevator_init_fn = as_init_queue, .elevator_exit_fn = as_exit_queue, .trim = as_trim, + .elevator_alloc_sched_queue_fn = as_alloc_as_queue, + .elevator_free_sched_queue_fn = as_free_as_queue, +#ifdef CONFIG_IOSCHED_AS_HIER + .elevator_expire_ioq_fn = as_expire_ioq, 
+ .elevator_active_ioq_set_fn = as_active_ioq_set, }, - + .elevator_features = ELV_IOSCHED_NEED_FQ | ELV_IOSCHED_SINGLE_IOQ, +#else + }, +#endif .elevator_attrs = as_attrs, .elevator_name = "anticipatory", .elevator_owner = THIS_MODULE, diff --git a/block/blk-core.c b/block/blk-core.c index e3299a7..112a629 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -90,6 +90,27 @@ void blk_queue_congestion_threshold(struct request_queue *q) q->nr_congestion_off = nr; } +#ifdef CONFIG_GROUP_IOSCHED +int blk_queue_io_group_congested(struct backing_dev_info *bdi, int bdi_bits, + struct page *page) +{ + int ret = 0; + struct request_queue *q = bdi->unplug_io_data; + + if (!q || !q->elevator) + return bdi_congested(bdi, bdi_bits); + + /* Do we need to hold queue lock? */ + if (bdi_bits & (1 << BDI_sync_congested)) + ret |= elv_page_io_group_congested(q, page, 1); + + if (bdi_bits & (1 << BDI_async_congested)) + ret |= elv_page_io_group_congested(q, page, 0); + + return ret; +} +#endif + /** * blk_get_backing_dev_info - get the address of a queue's backing_dev_info * @bdev: device @@ -460,20 +481,53 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static int blk_init_free_list(struct request_queue *q) +struct request_list * +blk_get_request_list(struct request_queue *q, struct bio *bio) +{ +#ifdef CONFIG_GROUP_IOSCHED + /* + * Determine which request list bio will be allocated from. 
This + * is dependent on which io group bio belongs to + */ + return elv_get_request_list_bio(q, bio); +#else + return &q->rq; +#endif +} + +static struct request_list *rq_rl(struct request_queue *q, struct request *rq) +{ +#ifdef CONFIG_GROUP_IOSCHED + int priv = rq->cmd_flags & REQ_ELVPRIV; + + return elv_get_request_list_rq(q, rq, priv); +#else + return &q->rq; +#endif +} + +void blk_init_request_list(struct request_list *rl) { - struct request_list *rl = &q->rq; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; - rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - rl->elvpriv = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); +} - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, q->node); +static int blk_init_free_list(struct request_queue *q) +{ + /* + * In case of group scheduling, request list is inside group and is + * initialized when group is instantiated. + */ +#ifndef CONFIG_GROUP_IOSCHED + blk_init_request_list(&q->rq); +#endif + q->rq_data.rq_pool = mempool_create_node(BLKDEV_MIN_RQ, + mempool_alloc_slab, mempool_free_slab, + request_cachep, q->node); - if (!rl->rq_pool) + if (!q->rq_data.rq_pool) return -ENOMEM; return 0; @@ -581,6 +635,9 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; + /* init starved waiter wait queue */ + init_waitqueue_head(&q->rq_data.starved_wait); + /* * This also sets hw/phys segments, boundary and size */ @@ -615,13 +672,14 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) { if (rq->cmd_flags & REQ_ELVPRIV) elv_put_request(q, rq); - mempool_free(rq, q->rq.rq_pool); + mempool_free(rq, q->rq_data.rq_pool); } static struct request * -blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) +blk_alloc_request(struct request_queue *q, struct bio *bio, int flags, int priv, +
gfp_t gfp_mask) { - struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + struct request *rq = mempool_alloc(q->rq_data.rq_pool, gfp_mask); if (!rq) return NULL; @@ -631,8 +689,8 @@ blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) rq->cmd_flags = flags | REQ_ALLOCED; if (priv) { - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); + if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) { + mempool_free(rq, q->rq_data.rq_pool); return NULL; } rq->cmd_flags |= REQ_ELVPRIV; @@ -675,18 +733,20 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int sync) +static void __freed_request(struct request_queue *q, int sync, + struct request_list *rl) { - struct request_list *rl = &q->rq; - - if (rl->count[sync] < queue_congestion_off_threshold(q)) + if (q->rq_data.count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); - if (rl->count[sync] + 1 <= q->nr_requests) { + if (q->rq_data.count[sync] + 1 <= q->nr_requests) + blk_clear_queue_full(q, sync); + + elv_freed_request(rl, sync); + + if (rl->count[sync] + 1 <= q->nr_group_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); - - blk_clear_queue_full(q, sync); } } @@ -694,63 +754,133 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. 
*/ -static void freed_request(struct request_queue *q, int sync, int priv) +static void freed_request(struct request_queue *q, int sync, int priv, + struct request_list *rl) { - struct request_list *rl = &q->rq; + /* + * There is a window during request allocation where request is + * mapped to one group but by the time a queue for the group is + * allocated, it is possible that original cgroup/io group has been + * deleted and now io queue is allocated in a different group (root) + * altogether. + * + * One solution to the problem is that rq should take io group + * reference. But it looks too much to do that to solve this issue. + * The only side effect to the hard to hit issue seems to be that + * we will try to decrement the rl->count for a request list which + * did not allocate that request. Check for rl->count going less than + * zero and do not decrement it if that's the case. + */ + + if (priv && rl->count[sync] > 0) + rl->count[sync]--; + + BUG_ON(!q->rq_data.count[sync]); + q->rq_data.count[sync]--; - rl->count[sync]--; if (priv) - rl->elvpriv--; + q->rq_data.elvpriv--; - __freed_request(q, sync); + __freed_request(q, sync, rl); if (unlikely(rl->starved[sync ^ 1])) - __freed_request(q, sync ^ 1); + __freed_request(q, sync ^ 1, rl); + + /* Wake up the starved process on global list, if any */ + if (unlikely(q->rq_data.starved)) { + if (waitqueue_active(&q->rq_data.starved_wait)) + wake_up(&q->rq_data.starved_wait); + q->rq_data.starved--; + } +} + +/* + * Returns whether one can sleep on this request list or not. There are + * cases (elevator switch) where request list might not have allocated + * any request descriptor but we deny request allocation due to global + * limits. In that case one should sleep on global list as on this request + * list no wakeup will take place. + * + * Also sets the request list starved flag if there are no requests pending + * in the direction of rq.
+ * + * Return 1 --> sleep on request list, 0 --> sleep on global list + */ +static int can_sleep_on_request_list(struct request_list *rl, int is_sync) +{ + if (unlikely(rl->count[is_sync] == 0)) { + /* + * If there is a request pending in other direction + * in same io group, then set the starved flag of + * the group request list. Otherwise, we need to + * make this process sleep in global starved list + * to make sure it will not sleep indefinitely. + */ + if (rl->count[is_sync ^ 1] != 0) { + rl->starved[is_sync] = 1; + return 1; + } else + return 0; + } + + return 1; } /* * Get a free request, queue_lock must be held. - * Returns NULL on failure, with queue_lock held. + * Returns NULL on failure, with queue_lock held. Also sets the "reason" field + * in case of failure. This reason field helps caller decide whether to sleep + * on per group list or global per queue list. + * reason = 0 sleep on per group list + * reason = 1 sleep on global list + * * Returns !NULL on success, with queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, gfp_t gfp_mask, + struct request_list *rl, int *reason) { struct request *rq = NULL; - struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue, priv; + int sleep_on_global = 0; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; - if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { - if (rl->count[is_sync]+1 >= q->nr_requests) { - ioc = current_io_context(GFP_ATOMIC, q->node); - /* - * The queue will fill after this allocation, so set - * it as full, and mark this process as "batching". - * This process will be allowed to complete a batch of - * requests, others will be blocked.
- */ - if (!blk_queue_full(q, is_sync)) { - ioc_set_batching(q, ioc); - blk_set_queue_full(q, is_sync); - } else { - if (may_queue != ELV_MQUEUE_MUST - && !ioc_batching(q, ioc)) { - /* - * The queue is full and the allocating - * process is not a "batcher", and not - * exempted by the IO scheduler - */ - goto out; - } + if (q->rq_data.count[is_sync]+1 >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, is_sync); + + /* check if io group will get congested after this allocation*/ + elv_get_request(rl, is_sync); + + /* queue full seems redundant now */ + if (q->rq_data.count[is_sync]+1 >= q->nr_requests) + blk_set_queue_full(q, is_sync); + + if (rl->count[is_sync]+1 >= q->nr_group_requests) { + ioc = current_io_context(GFP_ATOMIC, q->node); + /* + * The queue request descriptor group will fill after this + * allocation, so set it as full, and mark this process as + * "batching". This process will be allowed to complete a + * batch of requests, others will be blocked. + */ + if (rl->count[is_sync] <= q->nr_group_requests) + ioc_set_batching(q, ioc); + else { + if (may_queue != ELV_MQUEUE_MUST + && !ioc_batching(q, ioc)) { + /* + * The queue is full and the allocating + * process is not a "batcher", and not + * exempted by the IO scheduler + */ + goto out; } } - blk_set_queue_congested(q, is_sync); } /* @@ -758,21 +888,60 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests */ - if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) + + if (q->rq_data.count[is_sync] >= (3 * q->nr_requests / 2)) { + /* + * Queue is too full for allocation. On which request queue + * the task should sleep? 
Generally it should sleep on its + * request list but if elevator switch is happening, in that + * window, request descriptors are allocated from global + * pool and are not accounted against any particular request + * list as group is going away. + * + * So it might happen that request list does not have any + * requests allocated at all and if process sleeps on per + * group request list, it will not be woken up. In such case, + * make it sleep on global starved list. + */ + if (test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags) + || !can_sleep_on_request_list(rl, is_sync)) + sleep_on_global = 1; + goto out; + } + + /* + * Allocation of request is allowed from queue perspective. Now check + * from per group request list + */ + + if (rl->count[is_sync] >= (3 * q->nr_group_requests / 2)) goto out; - rl->count[is_sync]++; rl->starved[is_sync] = 0; + q->rq_data.count[is_sync]++; + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + if (priv) { + q->rq_data.elvpriv++; + /* + * Account the request to request list only if request is + * going to elevator. During elevator switch, there will + * be small window where group is going away and new group + * will not be allocated till elevator switch is complete. + * So till then instead of slowing down the application, + * we will continue to allocate request from total common + * pool instead of per group limit + */ + rl->count[is_sync]++; + } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); + rq = blk_alloc_request(q, bio, rw_flags, priv, gfp_mask); + if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. Undo anything @@ -782,7 +951,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * wait queue, but this is pretty rare. 
*/ spin_lock_irq(q->queue_lock); - freed_request(q, is_sync, priv); + freed_request(q, is_sync, priv, rl); /* * in the very unlikely event that allocation failed and no @@ -792,9 +961,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * rq mempool into READ and WRITE */ rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; - + if (!can_sleep_on_request_list(rl, is_sync)) + sleep_on_global = 1; goto out; } @@ -809,6 +977,8 @@ rq_starved: trace_block_getrq(q, bio, rw_flags & 1); out: + if (reason && sleep_on_global) + *reason = 1; return rq; } @@ -822,16 +992,39 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) { const bool is_sync = rw_is_sync(rw_flags) != 0; + int sleep_on_global = 0; struct request *rq; + struct request_list *rl = blk_get_request_list(q, bio); - rq = get_request(q, rw_flags, bio, GFP_NOIO); + rq = get_request(q, rw_flags, bio, GFP_NOIO, rl, &sleep_on_global); while (!rq) { DEFINE_WAIT(wait); struct io_context *ioc; - struct request_list *rl = &q->rq; - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, - TASK_UNINTERRUPTIBLE); + if (sleep_on_global) { + /* + * Task failed allocation and needs to wait and + * try again. There are no requests pending from + * the io group hence need to sleep on global + * wait queue. Most likely the allocation failed + * because of memory issues. + */ + + q->rq_data.starved++; + prepare_to_wait_exclusive(&q->rq_data.starved_wait, + &wait, TASK_UNINTERRUPTIBLE); + } else { + /* + * We are about to sleep on a request list and we + * drop queue lock. After waking up, we will do + * finish_wait() on request list and in the mean + * time group might be gone. Take a reference to + * the group now. 
+ */ + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, + TASK_UNINTERRUPTIBLE); + elv_get_rl_iog(rl); + } trace_block_sleeprq(q, bio, rw_flags & 1); @@ -849,9 +1042,25 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, ioc_set_batching(q, ioc); spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[is_sync], &wait); - rq = get_request(q, rw_flags, bio, GFP_NOIO); + if (sleep_on_global) { + finish_wait(&q->rq_data.starved_wait, &wait); + sleep_on_global = 0; + } else { + /* + * We had taken a reference to the rl/iog. Put that now + */ + finish_wait(&rl->wait[is_sync], &wait); + elv_put_rl_iog(rl); + } + + /* + * After the sleep check the rl again in case cgroup bio + * belonged to is gone and it is mapped to root group now + */ + rl = blk_get_request_list(q, bio); + rq = get_request(q, rw_flags, bio, GFP_NOIO, rl, + &sleep_on_global); }; return rq; @@ -860,14 +1069,16 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { struct request *rq; + struct request_list *rl; BUG_ON(rw != READ && rw != WRITE); spin_lock_irq(q->queue_lock); + rl = blk_get_request_list(q, NULL); if (gfp_mask & __GFP_WAIT) { rq = get_request_wait(q, rw, NULL); } else { - rq = get_request(q, rw, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask, rl, NULL); if (!rq) spin_unlock_irq(q->queue_lock); } @@ -1084,12 +1295,13 @@ void __blk_put_request(struct request_queue *q, struct request *req) if (req->cmd_flags & REQ_ALLOCED) { int is_sync = rq_is_sync(req) != 0; int priv = req->cmd_flags & REQ_ELVPRIV; + struct request_list *rl = rq_rl(q, req); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); - freed_request(q, is_sync, priv); + freed_request(q, is_sync, priv, rl); } } EXPORT_SYMBOL_GPL(__blk_put_request); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d4ed600..890d475 100644 ---
a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -84,24 +84,31 @@ void exit_io_context(void) } } +void init_io_context(struct io_context *ioc) +{ + atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + spin_lock_init(&ioc->lock); + ioc->ioprio_changed = 0; + ioc->ioprio = 0; +#ifdef CONFIG_GROUP_IOSCHED + ioc->cgroup_changed = 0; +#endif + ioc->last_waited = jiffies; /* doesn't matter... */ + ioc->nr_batch_requests = 0; /* because this is 0 */ + ioc->aic = NULL; + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ioc->cic_list); + ioc->ioc_data = NULL; +} + struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ret; ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_long_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->cic_list); - ret->ioc_data = NULL; - } + if (ret) + init_io_context(ret); return ret; } diff --git a/block/blk-settings.c b/block/blk-settings.c index 476d870..c3102c7 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -149,6 +149,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) * set defaults */ q->nr_requests = BLKDEV_MAX_RQ; + q->nr_group_requests = BLKDEV_MAX_GROUP_RQ; q->make_request_fn = mfn; blk_queue_dma_alignment(q, 511); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 418d636..e0af5d6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -38,42 +38,67 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl = &q->rq; + struct request_list *rl; unsigned 
long nr; int ret = queue_var_store(&nr, page, count); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; spin_lock_irq(q->queue_lock); + rl = blk_get_request_list(q, NULL); q->nr_requests = nr; blk_queue_congestion_threshold(q); - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + if (q->rq_data.count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + else if (q->rq_data.count[BLK_RW_SYNC] < + queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, BLK_RW_SYNC); - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + if (q->rq_data.count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + else if (q->rq_data.count[BLK_RW_ASYNC] < + queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, BLK_RW_ASYNC); - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + if (q->rq_data.count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_SYNC); - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { + } else if (q->rq_data.count[BLK_RW_SYNC]+1 <= q->nr_requests) { blk_clear_queue_full(q, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + if (q->rq_data.count[BLK_RW_ASYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_ASYNC); - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { + } else if (q->rq_data.count[BLK_RW_ASYNC]+1 <= q->nr_requests) { blk_clear_queue_full(q, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } spin_unlock_irq(q->queue_lock); return ret; } +#ifdef CONFIG_GROUP_IOSCHED +static ssize_t queue_group_requests_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->nr_group_requests, (page)); +} + +static ssize_t queue_group_requests_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned 
long nr; + int ret = queue_var_store(&nr, page, count); + + if (nr < BLKDEV_MIN_RQ) + nr = BLKDEV_MIN_RQ; + + spin_lock_irq(q->queue_lock); + q->nr_group_requests = nr; + elv_updated_nr_group_requests(q); + spin_unlock_irq(q->queue_lock); + return ret; +} +#endif static ssize_t queue_ra_show(struct request_queue *q, char *page) { @@ -240,6 +265,14 @@ static struct queue_sysfs_entry queue_requests_entry = { .store = queue_requests_store, }; +#ifdef CONFIG_GROUP_IOSCHED +static struct queue_sysfs_entry queue_group_requests_entry = { + .attr = {.name = "nr_group_requests", .mode = S_IRUGO | S_IWUSR }, + .show = queue_group_requests_show, + .store = queue_group_requests_store, +}; +#endif + static struct queue_sysfs_entry queue_ra_entry = { .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, .show = queue_ra_show, @@ -314,6 +347,9 @@ static struct queue_sysfs_entry queue_iostats_entry = { static struct attribute *default_attrs[] = { &queue_requests_entry.attr, +#ifdef CONFIG_GROUP_IOSCHED + &queue_group_requests_entry.attr, +#endif &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, @@ -393,12 +429,11 @@ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - struct request_list *rl = &q->rq; blk_sync_queue(q); - if (rl->rq_pool) - mempool_destroy(rl->rq_pool); + if (q->rq_data.rq_pool) + mempool_destroy(q->rq_data.rq_pool); if (q->queue_tags) __blk_queue_free_tags(q); diff --git a/block/blk.h b/block/blk.h index 3fae6ad..d05b4cf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -1,6 +1,8 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H +#include "elevator-fq.h" + /* Amount of time in which a process may batch requests */ #define BLK_BATCH_TIME (HZ/50UL) @@ -71,6 +73,8 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + elv_activate_rq_fair(q, rq); + if 
(e->ops->elevator_activate_req_fn) e->ops->elevator_activate_req_fn(q, rq); } @@ -79,6 +83,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq { struct elevator_queue *e = q->elevator; + elv_deactivate_rq_fair(q, rq); + if (e->ops->elevator_deactivate_req_fn) e->ops->elevator_deactivate_req_fn(q, rq); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fd7080e..034b5ca 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -12,6 +12,7 @@ #include #include #include +#include "elevator-fq.h" /* * tunables @@ -23,17 +24,10 @@ static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; static const int cfq_back_max = 16 * 1024; /* penalty of a backwards seek */ static const int cfq_back_penalty = 2; -static const int cfq_slice_sync = HZ / 10; -static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; /* - * offset from end of service tree - */ -#define CFQ_IDLE_DELAY (HZ / 5) - -/* * below this threshold, we consider thinktime immediate */ #define CFQ_MIN_TT (2) @@ -43,7 +37,7 @@ static int cfq_slice_idle = HZ / 125; #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) +#define RQ_CFQQ(rq) (struct cfq_queue *) (elv_ioq_sched_queue((rq)->ioq)) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -53,8 +47,6 @@ static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); #define CFQ_PRIO_LISTS IOPRIO_BE_NR -#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) @@ -74,16 +66,11 @@ struct cfq_rb_root { * Per process-grouping structure */ struct cfq_queue { - /* reference count */ - atomic_t ref; + struct io_queue *ioq; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ struct 
cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - unsigned long rb_key; /* prio tree member */ struct rb_node p_node; /* prio tree root we belong to, if any */ @@ -99,18 +86,13 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; - unsigned long slice_end; - long slice_resid; unsigned int slice_dispatch; /* pending metadata requests */ int meta_pending; - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; + unsigned short org_ioprio, org_ioprio_class; pid_t pid; }; @@ -120,12 +102,6 @@ struct cfq_queue { */ struct cfq_data { struct request_queue *queue; - - /* - * rr list of queues with requests and the count of them - */ - struct cfq_rb_root service_tree; - /* * Each priority tree is sorted by next_request position. These * trees are used when determining if two or more queues are @@ -133,14 +109,6 @@ struct cfq_data { */ struct rb_root prio_trees[CFQ_PRIO_LISTS]; - unsigned int busy_queues; - /* - * Used to track any pending rt requests so we can pre-empt current - * non-RT cfqq in service when this value is non-zero. 
- */ - unsigned int busy_rt_queues; - - int rq_in_driver; int sync_flight; /* @@ -151,21 +119,8 @@ struct cfq_data { int hw_tag_samples; int rq_in_driver_peak; - /* - * idle window management - */ - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct cfq_queue *active_queue; struct cfq_io_context *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -175,7 +130,6 @@ struct cfq_data { unsigned int cfq_fifo_expire[2]; unsigned int cfq_back_penalty; unsigned int cfq_back_max; - unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; @@ -188,16 +142,10 @@ struct cfq_data { }; enum cfqq_state_flags { - CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ - CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ - CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ - CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ - CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ }; @@ -215,16 +163,10 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ } -CFQ_CFQQ_FNS(on_rr); -CFQ_CFQQ_FNS(wait_request); -CFQ_CFQQ_FNS(must_dispatch); CFQ_CFQQ_FNS(must_alloc); CFQ_CFQQ_FNS(must_alloc_slice); CFQ_CFQQ_FNS(fifo_expire); -CFQ_CFQQ_FNS(idle_window); CFQ_CFQQ_FNS(prio_changed); -CFQ_CFQQ_FNS(slice_new); -CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); #undef CFQ_CFQQ_FNS @@ -234,8 +176,8 @@ CFQ_CFQQ_FNS(coop); blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 
static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, - struct io_context *, gfp_t); +static struct cfq_queue *cfq_get_queue(struct cfq_data *, struct bio *bio, + int, struct io_context *, gfp_t); static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); @@ -245,84 +187,79 @@ static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, return cic->cfqq[!!is_sync]; } -static inline void cic_set_cfqq(struct cfq_io_context *cic, - struct cfq_queue *cfqq, int is_sync) -{ - cic->cfqq[!!is_sync] = cfqq; -} - /* - * We regard a request as SYNC, if it's either a read or has the SYNC bit - * set (in which case it could also be direct WRITE). + * Determine the cfq queue bio should go in. This is primarily used by + * front merge and allow merge functions. + * + * Currently this function takes the ioprio and ioprio_class from task + * submitting async bio. Later save the task information in the page_cgroup + * and retrieve task's ioprio and class from there.
*/ -static inline int cfq_bio_sync(struct bio *bio) +static struct cfq_queue *cic_bio_to_cfqq(struct cfq_data *cfqd, + struct cfq_io_context *cic, struct bio *bio, int is_sync) { - if (bio_data_dir(bio) == READ || bio_sync(bio)) - return 1; + struct cfq_queue *cfqq = NULL; - return 0; -} + cfqq = cic_to_cfqq(cic, is_sync); -/* - * scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing - */ -static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) -{ - if (cfqd->busy_queues) { - cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + if (!cfqq && !is_sync) { + const int ioprio = task_ioprio(cic->ioc); + const int ioprio_class = task_ioprio_class(cic->ioc); + struct io_group *iog; + /* + * async bio tracking is enabled and we are not caching + * async queue pointer in cic. + */ + iog = elv_io_get_io_group_bio(cfqd->queue, bio, 0); + if (!iog) { + /* + * May be this is first rq/bio and io group has not + * been setup yet. + */ + return NULL; + } + return elv_io_group_async_queue_prio(iog, ioprio_class, ioprio); } +#endif + return cfqq; } -static int cfq_queue_empty(struct request_queue *q) +static inline void cic_set_cfqq(struct cfq_io_context *cic, + struct cfq_queue *cfqq, int is_sync) { - struct cfq_data *cfqd = q->elevator->elevator_data; - - return !cfqd->busy_queues; +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + /* + * Don't cache async queue pointer as now one io context might + * be submitting async io for various different async queues + */ + if (!is_sync) + return; +#endif + cic->cfqq[!!is_sync] = cfqq; } -/* - * Scale schedule slice based on io priority. Use the sync time slice only - * if a queue is marked sync and has sync io queued. A sync queue with async - * io only, should not get full sync slice length. 
- */ -static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync, - unsigned short prio) +static inline struct io_group *cfqq_to_io_group(struct cfq_queue *cfqq) { - const int base_slice = cfqd->cfq_slice[sync]; - - WARN_ON(prio >= IOPRIO_BE_NR); - - return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); + return ioq_to_io_group(cfqq->ioq); } -static inline int -cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static inline int cfq_class_idle(struct cfq_queue *cfqq) { - return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); + return elv_ioq_class_idle(cfqq->ioq); } -static inline void -cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static inline int cfq_cfqq_sync(struct cfq_queue *cfqq) { - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; - cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); + return elv_ioq_sync(cfqq->ioq); } -/* - * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end - * isn't valid until the first request from the dispatch is activated - * and the slice time set. 
- */ -static inline int cfq_slice_used(struct cfq_queue *cfqq) +static inline int cfqq_is_active_queue(struct cfq_queue *cfqq) { - if (cfq_cfqq_slice_new(cfqq)) - return 0; - if (time_before(jiffies, cfqq->slice_end)) - return 0; + struct cfq_data *cfqd = cfqq->cfqd; + struct elevator_queue *e = cfqd->queue->elevator; - return 1; + return (elv_active_sched_queue(e) == cfqq); } /* @@ -421,33 +358,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) } /* - * The below is leftmost cache rbtree addon - */ -static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) -{ - if (!root->left) - root->left = rb_first(&root->rb); - - if (root->left) - return rb_entry(root->left, struct cfq_queue, rb_node); - - return NULL; -} - -static void rb_erase_init(struct rb_node *n, struct rb_root *root) -{ - rb_erase(n, root); - RB_CLEAR_NODE(n); -} - -static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) -{ - if (root->left == n) - root->left = NULL; - rb_erase_init(n, &root->rb); -} - -/* * would be nice to take fifo expire time into account as well */ static struct request * @@ -474,95 +384,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, return cfq_choose_req(cfqd, next, prev); } -static unsigned long cfq_slice_offset(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - /* - * just an approximation, should be ok. - */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); -} - -/* - * The cfqd->service_tree holds all pending cfq_queue's that have - * requests waiting to be processed. It is sorted in the order that - * we will service the queues. 
- */ -static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, - int add_front) -{ - struct rb_node **p, *parent; - struct cfq_queue *__cfqq; - unsigned long rb_key; - int left; - - if (cfq_class_idle(cfqq)) { - rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); - if (parent && parent != &cfqq->rb_node) { - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - rb_key += __cfqq->rb_key; - } else - rb_key += jiffies; - } else if (!add_front) { - rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; - rb_key += cfqq->slice_resid; - cfqq->slice_resid = 0; - } else - rb_key = 0; - - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - /* - * same position, nothing more to do - */ - if (rb_key == cfqq->rb_key) - return; - - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); - } - - left = 1; - parent = NULL; - p = &cfqd->service_tree.rb.rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - - /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. 
- */ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (rb_key < __cfqq->rb_key) - n = &(*p)->rb_left; - else - n = &(*p)->rb_right; - - if (n == &(*p)->rb_right) - left = 0; - - p = n; - } - - if (left) - cfqd->service_tree.left = &cfqq->rb_node; - - cfqq->rb_key = rb_key; - rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); -} - static struct cfq_queue * cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, @@ -624,57 +445,43 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } -/* - * Update cfqq's position in the service tree. - */ -static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) +/* An active ioq is being reset. A chance to do cic related stuff. */ +static void cfq_active_ioq_reset(struct request_queue *q, void *sched_queue) { - /* - * Resorting requires the cfqq to be on the RR list already. 
- */ - if (cfq_cfqq_on_rr(cfqq)) { - cfq_service_tree_add(cfqd, cfqq, 0); - cfq_prio_tree_add(cfqd, cfqq); - } -} + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = sched_queue; -/* - * add to busy list of queues for service, trying to be fair in ordering - * the pending list according to last request service - */ -static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); - BUG_ON(cfq_cfqq_on_rr(cfqq)); - cfq_mark_cfqq_on_rr(cfqq); - cfqd->busy_queues++; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues++; + if (cfqd->active_cic) { + put_io_context(cfqd->active_cic->ioc); + cfqd->active_cic = NULL; + } - cfq_resort_rr_list(cfqd, cfqq); + /* Resort the cfqq in prio tree */ + if (cfqq) + cfq_prio_tree_add(cfqd, cfqq); } -/* - * Called when the cfqq no longer has requests pending, remove it from - * the service tree. - */ -static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +/* An ioq has been set as active one. */ +static void cfq_active_ioq_set(struct request_queue *q, void *sched_queue, + int coop) { - cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - cfq_clear_cfqq_on_rr(cfqq); + struct cfq_queue *cfqq = sched_queue; - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } + cfqq->slice_dispatch = 0; - BUG_ON(!cfqd->busy_queues); - cfqd->busy_queues--; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues--; + cfq_clear_cfqq_must_alloc_slice(cfqq); + cfq_clear_cfqq_fifo_expire(cfqq); + + /* + * If queue was selected because it was a close cooperator, then + * mark it so that it is not selected again and again. Otherwise + * clear the coop flag so that it becomes eligible to get selected + * again. 
+ */ + if (coop) + cfq_mark_cfqq_coop(cfqq); + else + cfq_clear_cfqq_coop(cfqq); } /* @@ -683,7 +490,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -691,8 +497,17 @@ static void cfq_del_rq_rb(struct request *rq) elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + /* + * If this was last request in the queue, remove this queue from + * prio trees. For last request nr_queued count will still be 1 as + * elevator fair queuing layer is yet to do the accounting. + */ + if (elv_ioq_nr_queued(cfqq->ioq) == 1) { + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -710,9 +525,6 @@ static void cfq_add_rq_rb(struct request *rq) while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) cfq_dispatch_insert(cfqd->queue, __alias); - if (!cfq_cfqq_on_rr(cfqq)) - cfq_add_cfqq_rr(cfqd, cfqq); - /* * check if this request is a better next-serve candidate */ @@ -720,7 +532,9 @@ static void cfq_add_rq_rb(struct request *rq) cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); /* - * adjust priority tree position, if ->next_rq changes + * adjust priority tree position, if ->next_rq changes. This should + * also take care of adding a new queue to prio tree as if this is + * first request then prev would be null and cfqq->next_rq will not. 
*/ if (prev != cfqq->next_rq) cfq_prio_tree_add(cfqd, cfqq); @@ -746,7 +560,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) if (!cic) return NULL; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_bio_to_cfqq(cfqd, cic, bio, elv_bio_sync(bio)); if (cfqq) { sector_t sector = bio->bi_sector + bio_sectors(bio); @@ -760,23 +574,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver++; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - cfqd->rq_in_driver); - cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } -static void cfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - cfqd->rq_in_driver); -} - static void cfq_remove_request(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); @@ -843,7 +643,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, /* * Disallow merge of a sync bio into an async request. 
*/ - if (cfq_bio_sync(bio) && !rq_is_sync(rq)) + if (elv_bio_sync(bio) && !rq_is_sync(rq)) return 0; /* @@ -854,100 +654,28 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, if (!cic) return 0; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_bio_to_cfqq(cfqd, cic, bio, elv_bio_sync(bio)); if (cfqq == RQ_CFQQ(rq)) return 1; return 0; } -static void __cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active"); - cfqq->slice_end = 0; - cfqq->slice_dispatch = 0; - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_must_dispatch(cfqq); - cfq_clear_cfqq_must_alloc_slice(cfqq); - cfq_clear_cfqq_fifo_expire(cfqq); - cfq_mark_cfqq_slice_new(cfqq); - - del_timer(&cfqd->idle_slice_timer); - } - - cfqd->active_queue = cfqq; -} - /* * current cfqq expired its slice (or was too idle), select new one */ static void -__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - int timed_out) +__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); - - if (cfq_cfqq_wait_request(cfqq)) - del_timer(&cfqd->idle_slice_timer); - - cfq_clear_cfqq_wait_request(cfqq); - - /* - * store what was left of this slice, if the queue idled/timed out - */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) { - cfqq->slice_resid = cfqq->slice_end - jiffies; - cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); - } - - cfq_resort_rr_list(cfqd, cfqq); - - if (cfqq == cfqd->active_queue) - cfqd->active_queue = NULL; - - if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->ioc); - cfqd->active_cic = NULL; - } + elv_ioq_slice_expired(cfqd->queue, cfqq->ioq); } -static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) +static inline void cfq_slice_expired(struct cfq_data *cfqd) { - struct cfq_queue *cfqq = cfqd->active_queue; + struct cfq_queue *cfqq = 
elv_active_sched_queue(cfqd->queue->elevator); if (cfqq) - __cfq_slice_expired(cfqd, cfqq, timed_out); -} - -/* - * Get next queue for service. Unless we have a queue preemption, - * we'll simply select the first cfqq in the service tree. - */ -static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) -{ - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) - return NULL; - - return cfq_rb_first(&cfqd->service_tree); -} - -/* - * Get and set a new active queue for service. - */ -static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (!cfqq) { - cfqq = cfq_get_next_queue(cfqd); - if (cfqq) - cfq_clear_cfqq_coop(cfqq); - } - - __cfq_set_active_queue(cfqd, cfqq); - return cfqq; + __cfq_slice_expired(cfqd, cfqq); } static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, @@ -1024,11 +752,11 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid * assumption. 
*/ -static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq, - int probe) +static struct io_queue *cfq_close_cooperator(struct request_queue *q, + void *cur_sched_queue) { - struct cfq_queue *cfqq; + struct cfq_queue *cur_cfqq = cur_sched_queue, *cfqq; + struct cfq_data *cfqd = q->elevator->elevator_data; /* * A valid cfq_io_context is necessary to compare requests against @@ -1049,14 +777,13 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (cfq_cfqq_coop(cfqq)) return NULL; - if (!probe) - cfq_mark_cfqq_coop(cfqq); - return cfqq; + return cfqq->ioq; } -static void cfq_arm_slice_timer(struct cfq_data *cfqd) +static void cfq_arm_slice_timer(struct request_queue *q, void *sched_queue) { - struct cfq_queue *cfqq = cfqd->active_queue; + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = sched_queue; struct cfq_io_context *cic; unsigned long sl; @@ -1069,18 +796,18 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); - WARN_ON(cfq_cfqq_slice_new(cfqq)); + WARN_ON(elv_ioq_slice_new(cfqq->ioq)); /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) + if (!cfqd->cfq_slice_idle || !elv_ioq_idle_window(cfqq->ioq)) return; /* * still requests with the driver, don't idle */ - if (cfqd->rq_in_driver) + if (elv_rq_in_driver(q->elevator)) return; /* @@ -1090,7 +817,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) if (!cic || !atomic_read(&cic->ioc->nr_tasks)) return; - cfq_mark_cfqq_wait_request(cfqq); + elv_mark_ioq_wait_request(cfqq->ioq); /* * we don't want to idle for seeks, but we do want to allow @@ -1101,7 +828,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); - mod_timer(&cfqd->idle_slice_timer, jiffies + sl); + 
elv_mod_idle_slice_timer(q->elevator, jiffies + sl); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); } @@ -1113,10 +840,9 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); - cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfq_log_cfqq(cfqd, cfqq, "dispatch_insert sect=%d", blk_rq_sectors(rq)); cfq_remove_request(rq); - cfqq->dispatched++; elv_dispatch_sort(q, rq); if (cfq_cfqq_sync(cfqq)) @@ -1154,78 +880,11 @@ static inline int cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { const int base_rq = cfqd->cfq_slice_async_rq; + unsigned short ioprio = elv_ioq_ioprio(cfqq->ioq); - WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); -} - -/* - * Select a queue for service. If we have a current active queue, - * check whether to continue servicing it, or retrieve and set a new one. - */ -static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq, *new_cfqq = NULL; - - cfqq = cfqd->active_queue; - if (!cfqq) - goto new_queue; - - /* - * The active queue has run out of time, expire it and select new. - */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) - goto expire; + WARN_ON(ioprio >= IOPRIO_BE_NR); - /* - * If we have a RT cfqq waiting, then we pre-empt the current non-rt - * cfqq. - */ - if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { - /* - * We simulate this as cfqq timed out so that it gets to bank - * the remaining of its time slice. - */ - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - goto new_queue; - } - - /* - * The active queue has requests and isn't expired, allow it to - * dispatch. - */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto keep_queue; - - /* - * If another queue has a request waiting within our mean seek - * distance, let it run. 
The expire code will check for close - * cooperators and put the close queue at the front of the service - * tree. - */ - new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); - if (new_cfqq) - goto expire; - - /* - * No requests pending. If the active queue still has requests in - * flight or is idling for a new request, allow either of these - * conditions to happen (or time out) before selecting a new queue. - */ - if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { - cfqq = NULL; - goto keep_queue; - } - -expire: - cfq_slice_expired(cfqd, 0); -new_queue: - cfqq = cfq_set_active_queue(cfqd, new_cfqq); -keep_queue: - return cfqq; + return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - ioprio)); } static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) @@ -1250,12 +909,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) + while ((cfqq = elv_select_sched_queue(cfqd->queue, 1)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); - cfq_slice_expired(cfqd, 0); + /* This is probably redundant now; the loop above should make sure + * that all the busy queues have expired */ + cfq_slice_expired(cfqd); - BUG_ON(cfqd->busy_queues); + BUG_ON(elv_nr_busy_ioq(cfqd->queue->elevator)); cfq_log(cfqd, "forced_dispatch=%d", dispatched); return dispatched; @@ -1301,13 +962,10 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) struct cfq_queue *cfqq; unsigned int max_dispatch; - if (!cfqd->busy_queues) - return 0; - if (unlikely(force)) return cfq_forced_dispatch(cfqd); - cfqq = cfq_select_queue(cfqd); + cfqq = elv_select_sched_queue(q, 0); if (!cfqq) return 0; @@ -1324,7 +982,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) /* * Does this cfqq already have too much IO in flight?
*/ - if (cfqq->dispatched >= max_dispatch) { + if (elv_ioq_nr_dispatched(cfqq->ioq) >= max_dispatch) { /* * idle queue must always only have a single IO in flight */ @@ -1334,13 +992,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1) + if (elv_nr_busy_ioq(q->elevator) > 1) return 0; /* * we are the only queue, allow up to 4 times of 'quantum' */ - if (cfqq->dispatched >= 4 * max_dispatch) + if (elv_ioq_nr_dispatched(cfqq->ioq) >= 4 * max_dispatch) return 0; } @@ -1349,51 +1007,45 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) */ cfq_dispatch_request(cfqd, cfqq); cfqq->slice_dispatch++; - cfq_clear_cfqq_must_dispatch(cfqq); /* * expire an async queue immediately if it has used up its slice. idle * queue always expire after 1 dispatch round. */ - if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && + if (elv_nr_busy_ioq(q->elevator) > 1 && ((!cfq_cfqq_sync(cfqq) && cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || - cfq_class_idle(cfqq))) { - cfqq->slice_end = jiffies + 1; - cfq_slice_expired(cfqd, 0); + (cfq_class_idle(cfqq) && !elv_iog_should_idle(cfqq->ioq)))) { + cfq_slice_expired(cfqd); } cfq_log(cfqd, "dispatched a request"); return 1; } -/* - * task holds one reference to the queue, dropped when task exits. each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * queue lock must be held here. 
- */ -static void cfq_put_queue(struct cfq_queue *cfqq) +static void cfq_free_cfq_queue(struct elevator_queue *e, void *sched_queue) { + struct cfq_queue *cfqq = sched_queue; struct cfq_data *cfqd = cfqq->cfqd; - BUG_ON(atomic_read(&cfqq->ref) <= 0); - - if (!atomic_dec_and_test(&cfqq->ref)) - return; + BUG_ON(!cfqq); - cfq_log_cfqq(cfqd, cfqq, "put_queue"); + cfq_log_cfqq(cfqd, cfqq, "free_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); - if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); + if (unlikely(cfqq_is_active_queue(cfqq))) { + __cfq_slice_expired(cfqd, cfqq); + elv_schedule_dispatch(cfqd->queue); } kmem_cache_free(cfq_pool, cfqq); } +static inline void cfq_put_queue(struct cfq_queue *cfqq) +{ + elv_put_ioq(cfqq->ioq); +} + /* * Must always be called with the rcu_read_lock() held */ @@ -1481,9 +1133,9 @@ static void cfq_free_io_context(struct io_context *ioc) static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); + if (unlikely(cfqq == elv_active_sched_queue(cfqd->queue->elevator))) { + __cfq_slice_expired(cfqd, cfqq); + elv_schedule_dispatch(cfqd->queue); } cfq_put_queue(cfqq); @@ -1571,7 +1223,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) { struct task_struct *tsk = current; - int ioprio_class; + int ioprio_class, ioprio; if (!cfq_cfqq_prio_changed(cfqq)) return; @@ -1584,30 +1236,33 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) /* * no prio set, inherit CPU scheduling settings */ - cfqq->ioprio = task_nice_ioprio(tsk); - cfqq->ioprio_class = task_nice_ioclass(tsk); + ioprio = task_nice_ioprio(tsk); + ioprio_class = task_nice_ioclass(tsk); break; 
case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(ioc); - cfqq->ioprio_class = IOPRIO_CLASS_RT; + ioprio = task_ioprio(ioc); + ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = task_ioprio(ioc); - cfqq->ioprio_class = IOPRIO_CLASS_BE; + ioprio = task_ioprio(ioc); + ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: - cfqq->ioprio_class = IOPRIO_CLASS_IDLE; - cfqq->ioprio = 7; - cfq_clear_cfqq_idle_window(cfqq); + ioprio_class = IOPRIO_CLASS_IDLE; + ioprio = 7; + elv_clear_ioq_idle_window(cfqq->ioq); break; } + elv_ioq_set_ioprio_class(cfqq->ioq, ioprio_class); + elv_ioq_set_ioprio(cfqq->ioq, ioprio); + /* * keep track of original prio settings in case we have to temporarily * elevate the priority of this queue */ - cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; + cfqq->org_ioprio = ioprio; + cfqq->org_ioprio_class = ioprio_class; cfq_clear_cfqq_prio_changed(cfqq); } @@ -1623,14 +1278,28 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) spin_lock_irqsave(cfqd->queue->queue_lock, flags); cfqq = cic->cfqq[BLK_RW_ASYNC]; + if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, + + /* + * Drop the reference to old queue unconditionally. Don't + * worry whether new async prio queue has been allocated + * or not. + */ + cic_set_cfqq(cic, NULL, BLK_RW_ASYNC); + cfq_put_queue(cfqq); + + /* + * Why to allocate new queue now? Will it not be automatically + * allocated whenever another async request from same context + * comes? 
Keeping it for the time being because existing cfq + * code allocates the new queue immediately upon prio change + */ + new_cfqq = cfq_get_queue(cfqd, NULL, BLK_RW_ASYNC, cic->ioc, GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + if (new_cfqq) + cic_set_cfqq(cic, new_cfqq, BLK_RW_ASYNC); } cfqq = cic->cfqq[BLK_RW_SYNC]; @@ -1649,42 +1318,136 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, pid_t pid, int is_sync) { - RB_CLEAR_NODE(&cfqq->rb_node); RB_CLEAR_NODE(&cfqq->p_node); INIT_LIST_HEAD(&cfqq->fifo); - atomic_set(&cfqq->ref, 0); cfqq->cfqd = cfqd; cfq_mark_cfqq_prio_changed(cfqq); if (is_sync) { if (!cfq_class_idle(cfqq)) - cfq_mark_cfqq_idle_window(cfqq); - cfq_mark_cfqq_sync(cfqq); + elv_mark_ioq_idle_window(cfqq->ioq); + elv_mark_ioq_sync(cfqq->ioq); } cfqq->pid = pid; } +#ifdef CONFIG_IOSCHED_CFQ_HIER +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *async_cfqq = cic_to_cfqq(cic, 0); + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + struct io_group *iog, *__iog; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + iog = elv_io_get_io_group(q, NULL, 0); + + if (async_cfqq != NULL) { + __iog = cfqq_to_io_group(async_cfqq); + if (iog != __iog) { + /* cgroup changed, drop the reference to async queue */ + cic_set_cfqq(cic, NULL, 0); + cfq_put_queue(async_cfqq); + } + } + + if (sync_cfqq != NULL) { + __iog = cfqq_to_io_group(sync_cfqq); + + /* + * Drop reference to sync queue. A new sync queue will + * be assigned in new group upon arrival of a fresh request. + * If old queue has got requests, those requests will be + * dispatched over a period of time and queue will be freed + * automatically.
+ */ + if (iog != __iog) { + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_IOSCHED_CFQ_HIER */ + static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, +cfq_find_alloc_queue(struct cfq_data *cfqd, struct bio *bio, int is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct request_queue *q = cfqd->queue; + struct io_queue *ioq = NULL, *new_ioq = NULL; + struct io_group *iog = NULL; retry: + iog = elv_io_get_io_group_bio(q, bio, 1); + cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + if (!cfqq && !is_sync) { + const int ioprio = task_ioprio(cic->ioc); + const int ioprio_class = task_ioprio_class(cic->ioc); + + /* + * We have not cached async queue pointer as bio tracking + * is enabled. Look into group async queue array using ioc + * class and prio to see if somebody already allocated the + * queue. + */ + + cfqq = elv_io_group_async_queue_prio(iog, ioprio_class, ioprio); + } +#endif + /* * Always try a new alloc if we fell back to the OOM cfqq * originally, since it should just be a temporary situation. */ if (!cfqq || cfqq == &cfqd->oom_cfqq) { + /* Allocate ioq object first and then cfqq */ + if (new_ioq) { + goto alloc_cfqq; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(cfqd->queue->queue_lock); + new_ioq = elv_alloc_ioq(q, gfp_mask | __GFP_ZERO); + spin_lock_irq(cfqd->queue->queue_lock); + if (new_ioq) + goto retry; + } else + ioq = elv_alloc_ioq(q, gfp_mask | __GFP_ZERO); + +alloc_cfqq: + if (!ioq && !new_ioq) { + /* ioq allocation failed. 
Default to oom_cfqq */ + cfqq = &cfqd->oom_cfqq; + goto out; + } + cfqq = NULL; if (new_cfqq) { + ioq = new_ioq; + new_ioq = NULL; cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { @@ -1702,60 +1465,82 @@ retry: } if (cfqq) { + elv_init_ioq(q->elevator, ioq, current->pid, is_sync); + elv_init_ioq_sched_queue(q->elevator, ioq, cfqq); + + cfqq->ioq = ioq; cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + + /* call it after cfq has initialized queue prio */ + elv_init_ioq_io_group(ioq, iog); + /* ioq reference on iog */ + elv_get_iog(iog); cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else + } else { cfqq = &cfqd->oom_cfqq; + /* If ioq allocation was successful, free it up */ + if (ioq) + elv_free_ioq(ioq); + } } + if (new_ioq) + elv_free_ioq(new_ioq); + if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); +out: return cfqq; } -static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; - case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; - default: - BUG(); - } -} - static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, - gfp_t gfp_mask) +cfq_get_queue(struct cfq_data *cfqd, struct bio *bio, int is_sync, + struct io_context *ioc, gfp_t gfp_mask) { const int ioprio = task_ioprio(ioc); const int ioprio_class = task_ioprio_class(ioc); - struct cfq_queue **async_cfqq = NULL; + struct cfq_queue *async_cfqq = NULL; struct cfq_queue *cfqq = NULL; + struct io_group *iog = elv_io_get_io_group_bio(cfqd->queue, bio, 1); if (!is_sync) { - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); - cfqq = *async_cfqq; + async_cfqq = elv_io_group_async_queue_prio(iog, ioprio_class, + ioprio); + cfqq = async_cfqq; } if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
+ cfqq = cfq_find_alloc_queue(cfqd, bio, is_sync, ioc, gfp_mask); + if (!is_sync && !async_cfqq) + elv_io_group_set_async_queue(iog, ioprio_class, ioprio, + cfqq->ioq); +#ifdef CONFIG_TRACK_ASYNC_CONTEXT /* - * pin the queue now that it's allocated, scheduler exit will prune it + * ioc reference. If async request queue/group is determined from the + * original task/cgroup and not from submitter task, io context can + * not cache the pointer to async queue and every time a request comes, + * it will be determined by going through the async queue array. + * + * This comes from the fact that we might be getting async requests + * which belong to a different cgroup altogether than the cgroup + * iocontext belongs to. And this thread might be submitting bios + * from various cgroups. So every time async queue will be different + * based on the cgroup of the bio/rq. Can't cache the async cfqq + * pointer in cic. */ - if (!is_sync && !(*async_cfqq)) { - atomic_inc(&cfqq->ref); - *async_cfqq = cfqq; - } - - atomic_inc(&cfqq->ref); + if (is_sync) + elv_get_ioq(cfqq->ioq); +#else + /* + * async requests are being attributed to task submitting + * it, hence cic can cache async cfqq pointer. Take the + * queue reference even for async queue.
+ */ + elv_get_ioq(cfqq->ioq); +#endif return cfqq; } @@ -1894,7 +1679,10 @@ out: smp_read_barrier_depends(); if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); - +#ifdef CONFIG_IOSCHED_CFQ_HIER + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); @@ -1960,7 +1748,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) return; - enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); + enable_idle = old_idle = elv_ioq_idle_window(cfqq->ioq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) @@ -1975,9 +1763,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (old_idle != enable_idle) { cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); if (enable_idle) - cfq_mark_cfqq_idle_window(cfqq); + elv_mark_ioq_idle_window(cfqq->ioq); else - cfq_clear_cfqq_idle_window(cfqq); + elv_clear_ioq_idle_window(cfqq->ioq); } } @@ -1986,16 +1774,15 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, * no or if we aren't sure, a 1 will cause a preempt. */ static int -cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, - struct request *rq) +cfq_should_preempt(struct request_queue *q, void *new_cfqq, struct request *rq) { - struct cfq_queue *cfqq; + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = elv_active_sched_queue(q->elevator); - cfqq = cfqd->active_queue; if (!cfqq) return 0; - if (cfq_slice_used(cfqq)) + if (elv_ioq_slice_used(cfqq->ioq)) return 1; if (cfq_class_idle(new_cfqq)) @@ -2018,13 +1805,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_meta(rq) && !cfqq->meta_pending) return 1; - /* - * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 
- */ - if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) - return 1; - - if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) + if (!cfqd->active_cic || !elv_ioq_wait_request(cfqq->ioq)) return 0; /* @@ -2038,27 +1819,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, } /* - * cfqq preempts the active queue. if we allowed preempt with no slice left, - * let it have half of its nominal slice. - */ -static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - - /* - * Put the new queue at the front of the of the current list, - * so we know that it will be selected next. - */ - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - - cfq_service_tree_add(cfqd, cfqq, 1); - - cfqq->slice_end = 0; - cfq_mark_cfqq_slice_new(cfqq); -} - -/* * Called when a new fs request (rq) is added (to cfqq). Check if there's * something we should do about it */ @@ -2077,36 +1837,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_update_idle_window(cfqd, cfqq, cic); cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (cfqq == cfqd->active_queue) { - /* - * Remember that we saw a request from this process, but - * don't start queuing just yet. Otherwise we risk seeing lots - * of tiny requests, because we disrupt the normal plugging - * and merging. If the request is already larger than a single - * page, let it rip immediately. For that case we assume that - * merging is already done. Ditto for a busy system that - * has other work pending, don't risk delaying until the - * idle timer unplug to continue working. 
- */ - if (cfq_cfqq_wait_request(cfqq)) { - if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || - cfqd->busy_queues > 1) { - del_timer(&cfqd->idle_slice_timer); - __blk_run_queue(cfqd->queue); - } - cfq_mark_cfqq_must_dispatch(cfqq); - } - } else if (cfq_should_preempt(cfqd, cfqq, rq)) { - /* - * not the active queue - expire current slice if it is - * idle and has expired it's mean thinktime or this new queue - * has some old slice time left and is of higher priority or - * this new queue is RT and the current one is BE - */ - cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue); - } } static void cfq_insert_request(struct request_queue *q, struct request *rq) @@ -2130,11 +1860,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { - if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = cfqd->rq_in_driver; + struct elevator_queue *eq = cfqd->queue->elevator; + + if (elv_rq_in_driver(eq) > cfqd->rq_in_driver_peak) + cfqd->rq_in_driver_peak = elv_rq_in_driver(eq); if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) + elv_rq_in_driver(eq) <= CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -2161,44 +1893,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver); - WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver--; - cfqq->dispatched--; - if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight--; - if (sync) RQ_CIC(rq)->last_end_request = now; - - /* - * If this is the active queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. 
- */ - if (cfqd->active_queue == cfqq) { - const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); - - if (cfq_cfqq_slice_new(cfqq)) { - cfq_set_prio_slice(cfqd, cfqq); - cfq_clear_cfqq_slice_new(cfqq); - } - /* - * If there are no requests waiting in this queue, and - * there are other queues ready to issue requests, AND - * those other queues are issuing requests within our - * mean seek distance, give them a chance to run instead - * of idling. - */ - if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && - sync && !rq_noidle(rq)) - cfq_arm_slice_timer(cfqd); - } - - if (!cfqd->rq_in_driver) - cfq_schedule_dispatch(cfqd); } /* @@ -2207,29 +1905,32 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) */ static void cfq_prio_boost(struct cfq_queue *cfqq) { + struct io_queue *ioq = cfqq->ioq; + if (has_fs_excl()) { /* * boost idle prio on transactions that would lock out other * users of the filesystem */ if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; + elv_ioq_set_ioprio_class(ioq, IOPRIO_CLASS_BE); + if (elv_ioq_ioprio(ioq) > IOPRIO_NORM) + elv_ioq_set_ioprio(ioq, IOPRIO_NORM); + } else { /* * check if we need to unboost the queue */ - if (cfqq->ioprio_class != cfqq->org_ioprio_class) - cfqq->ioprio_class = cfqq->org_ioprio_class; - if (cfqq->ioprio != cfqq->org_ioprio) - cfqq->ioprio = cfqq->org_ioprio; + if (elv_ioq_ioprio_class(ioq) != cfqq->org_ioprio_class) + elv_ioq_set_ioprio_class(ioq, cfqq->org_ioprio_class); + if (elv_ioq_ioprio(ioq) != cfqq->org_ioprio) + elv_ioq_set_ioprio(ioq, cfqq->org_ioprio); } } static inline int __cfq_may_queue(struct cfq_queue *cfqq) { - if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && + if ((elv_ioq_wait_request(cfqq->ioq) || cfq_cfqq_must_alloc(cfqq)) && !cfq_cfqq_must_alloc_slice(cfqq)) { 
cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; @@ -2282,7 +1983,7 @@ static void cfq_put_request(struct request *rq) put_io_context(RQ_CIC(rq)->ioc); rq->elevator_private = NULL; - rq->elevator_private2 = NULL; + rq->ioq = NULL; cfq_put_queue(cfqq); } @@ -2292,7 +1993,8 @@ static void cfq_put_request(struct request *rq) * Allocate cfq data structures associated with this request. */ static int -cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, + gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_context *cic; @@ -2312,125 +2014,37 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); + cfqq = cfq_get_queue(cfqd, bio, is_sync, cic->ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } cfqq->allocated[rw]++; cfq_clear_cfqq_must_alloc(cfqq); - atomic_inc(&cfqq->ref); + elv_get_ioq(cfqq->ioq); spin_unlock_irqrestore(q->queue_lock, flags); rq->elevator_private = cic; - rq->elevator_private2 = cfqq; + rq->ioq = cfqq->ioq; return 0; queue_fail: if (cic) put_io_context(cic->ioc); - cfq_schedule_dispatch(cfqd); + elv_schedule_dispatch(cfqd->queue); spin_unlock_irqrestore(q->queue_lock, flags); cfq_log(cfqd, "set_request fail"); return 1; } -static void cfq_kick_queue(struct work_struct *work) -{ - struct cfq_data *cfqd = - container_of(work, struct cfq_data, unplug_work); - struct request_queue *q = cfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue); - spin_unlock_irq(q->queue_lock); -} - -/* - * Timer running if the active_queue is currently idling inside its time slice - */ -static void cfq_idle_slice_timer(unsigned long data) -{ - struct cfq_data *cfqd = (struct cfq_data *) data; - struct cfq_queue *cfqq; - unsigned long flags; - int 
timed_out = 1; - - cfq_log(cfqd, "idle timer fired"); - - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - - cfqq = cfqd->active_queue; - if (cfqq) { - timed_out = 0; - - /* - * We saw a request before the queue expired, let it through - */ - if (cfq_cfqq_must_dispatch(cfqq)) - goto out_kick; - - /* - * expired - */ - if (cfq_slice_used(cfqq)) - goto expire; - - /* - * only expire and reinvoke request handler, if there are - * other queues with pending requests - */ - if (!cfqd->busy_queues) - goto out_cont; - - /* - * not expired and it has a request pending, let it dispatch - */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto out_kick; - } -expire: - cfq_slice_expired(cfqd, timed_out); -out_kick: - cfq_schedule_dispatch(cfqd); -out_cont: - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); -} - -static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) -{ - del_timer_sync(&cfqd->idle_slice_timer); - cancel_work_sync(&cfqd->unplug_work); -} - -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; - cfq_shutdown_timer_wq(cfqd); - spin_lock_irq(q->queue_lock); - if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - while (!list_empty(&cfqd->cic_list)) { struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, struct cfq_io_context, @@ -2439,16 +2053,11 @@ static void cfq_exit_queue(struct elevator_queue *e) __cfq_exit_single_io_context(cfqd, cic); } - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); - - cfq_shutdown_timer_wq(cfqd); - kfree(cfqd); } -static void *cfq_init_queue(struct request_queue *q) +static void 
*cfq_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct cfq_data *cfqd; int i; @@ -2457,8 +2066,6 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; - /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -2473,25 +2080,20 @@ static void *cfq_init_queue(struct request_queue *q) * will not attempt to free it. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - atomic_inc(&cfqd->oom_cfqq.ref); + + /* Link up oom_ioq and oom_cfqq */ + cfqd->oom_cfqq.ioq = elv_get_oom_ioq(eq); + elv_init_ioq_sched_queue(eq, elv_get_oom_ioq(eq), &cfqd->oom_cfqq); INIT_LIST_HEAD(&cfqd->cic_list); cfqd->queue = q; - init_timer(&cfqd->idle_slice_timer); - cfqd->idle_slice_timer.function = cfq_idle_slice_timer; - cfqd->idle_slice_timer.data = (unsigned long) cfqd; - - INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); - cfqd->cfq_quantum = cfq_quantum; cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; cfqd->cfq_back_max = cfq_back_max; cfqd->cfq_back_penalty = cfq_back_penalty; - cfqd->cfq_slice[0] = cfq_slice_async; - cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->hw_tag = 1; @@ -2560,8 +2162,6 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); -SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); -SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); #undef SHOW_FUNCTION @@ -2590,8 +2190,6 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_seek_penalty_store, 
&cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); #undef STORE_FUNCTION @@ -2605,10 +2203,14 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(fifo_expire_async), CFQ_ATTR(back_seek_max), CFQ_ATTR(back_seek_penalty), - CFQ_ATTR(slice_sync), - CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + ELV_ATTR(slice_sync), + ELV_ATTR(slice_async), +#ifdef CONFIG_GROUP_IOSCHED + ELV_ATTR(group_idle), + ELV_ATTR(fairness), +#endif __ATTR_NULL }; @@ -2621,8 +2223,6 @@ static struct elevator_type iosched_cfq = { .elevator_dispatch_fn = cfq_dispatch_requests, .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, - .elevator_deactivate_req_fn = cfq_deactivate_request, - .elevator_queue_empty_fn = cfq_queue_empty, .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, @@ -2632,7 +2232,14 @@ static struct elevator_type iosched_cfq = { .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, .trim = cfq_free_io_context, + .elevator_free_sched_queue_fn = cfq_free_cfq_queue, + .elevator_active_ioq_set_fn = cfq_active_ioq_set, + .elevator_active_ioq_reset_fn = cfq_active_ioq_reset, + .elevator_arm_slice_timer_fn = cfq_arm_slice_timer, + .elevator_should_preempt_fn = cfq_should_preempt, + .elevator_close_cooperator_fn = cfq_close_cooperator, }, + .elevator_features = ELV_IOSCHED_NEED_FQ, .elevator_attrs = cfq_attrs, .elevator_name = "cfq", .elevator_owner = THIS_MODULE, @@ -2640,14 +2247,6 @@ static struct elevator_type iosched_cfq = { static int __init cfq_init(void) { - /* - * could be 0 on HZ < 1000 setups - 
*/ - if (!cfq_slice_async) - cfq_slice_async = 1; - if (!cfq_slice_idle) - cfq_slice_idle = 1; - if (cfq_slab_setup()) return -ENOMEM; diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbc..cc9c8c3 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -13,6 +13,7 @@ #include #include #include +#include "elevator-fq.h" /* * See Documentation/block/deadline-iosched.txt @@ -23,25 +24,23 @@ static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ -struct deadline_data { - /* - * run time data - */ - +struct deadline_queue { /* * requests (deadline_rq s) are present on both sort_list and fifo_list */ - struct rb_root sort_list[2]; + struct rb_root sort_list[2]; struct list_head fifo_list[2]; - /* * next in sort order. read, write or both are NULL */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ - sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ +}; +struct deadline_data { + struct request_queue *q; + sector_t last_sector; /* head position */ /* * settings that change how the i/o scheduler behaves */ @@ -56,7 +55,9 @@ static void deadline_move_request(struct deadline_data *, struct request *); static inline struct rb_root * deadline_rb_root(struct deadline_data *dd, struct request *rq) { - return &dd->sort_list[rq_data_dir(rq)]; + struct deadline_queue *dq = elv_get_sched_queue(dd->q, rq); + + return &dq->sort_list[rq_data_dir(rq)]; } /* @@ -87,9 +88,10 @@ static inline void deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) { const int data_dir = rq_data_dir(rq); + struct deadline_queue *dq = elv_get_sched_queue(dd->q, rq); - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); + if (dq->next_rq[data_dir] == rq) + dq->next_rq[data_dir] = 
deadline_latter_request(rq); elv_rb_del(deadline_rb_root(dd, rq), rq); } @@ -102,6 +104,7 @@ deadline_add_request(struct request_queue *q, struct request *rq) { struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + struct deadline_queue *dq = elv_get_sched_queue(q, rq); deadline_add_rq_rb(dd, rq); @@ -109,7 +112,7 @@ deadline_add_request(struct request_queue *q, struct request *rq) * set expire time and add to fifo list */ rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + list_add_tail(&rq->queuelist, &dq->fifo_list[data_dir]); } /* @@ -129,6 +132,11 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) struct deadline_data *dd = q->elevator->elevator_data; struct request *__rq; int ret; + struct deadline_queue *dq; + + dq = elv_get_sched_queue_bio(q, bio); + if (!dq) + return ELEVATOR_NO_MERGE; /* * check for front merge @@ -136,7 +144,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) if (dd->front_merges) { sector_t sector = bio->bi_sector + bio_sectors(bio); - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + __rq = elv_rb_find(&dq->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); @@ -207,10 +215,11 @@ static void deadline_move_request(struct deadline_data *dd, struct request *rq) { const int data_dir = rq_data_dir(rq); + struct deadline_queue *dq = elv_get_sched_queue(dd->q, rq); - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; - dd->next_rq[data_dir] = deadline_latter_request(rq); + dq->next_rq[READ] = NULL; + dq->next_rq[WRITE] = NULL; + dq->next_rq[data_dir] = deadline_latter_request(rq); dd->last_sector = rq_end_sector(rq); @@ -225,9 +234,9 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. 
Requires !list_empty(&dd->fifo_list[data_dir]) */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +static inline int deadline_check_fifo(struct deadline_queue *dq, int ddir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + struct request *rq = rq_entry_fifo(dq->fifo_list[ddir].next); /* * rq is expired! @@ -245,20 +254,26 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) static int deadline_dispatch_requests(struct request_queue *q, int force) { struct deadline_data *dd = q->elevator->elevator_data; - const int reads = !list_empty(&dd->fifo_list[READ]); - const int writes = !list_empty(&dd->fifo_list[WRITE]); + struct deadline_queue *dq = elv_select_sched_queue(q, force); + int reads, writes; struct request *rq; int data_dir; + if (!dq) + return 0; + + reads = !list_empty(&dq->fifo_list[READ]); + writes = !list_empty(&dq->fifo_list[WRITE]); + /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; + if (dq->next_rq[WRITE]) + rq = dq->next_rq[WRITE]; else - rq = dd->next_rq[READ]; + rq = dq->next_rq[READ]; - if (rq && dd->batching < dd->fifo_batch) + if (rq && dq->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; @@ -268,9 +283,9 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) */ if (reads) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + BUG_ON(RB_EMPTY_ROOT(&dq->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (writes && (dq->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -284,9 +299,9 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) if (writes) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + BUG_ON(RB_EMPTY_ROOT(&dq->sort_list[WRITE])); - dd->starved = 0; + dq->starved = 0; data_dir = WRITE; @@ -299,55 +314,70 @@ dispatch_find_request: /* * we are not 
running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + if (deadline_check_fifo(dq, data_dir) || !dq->next_rq[data_dir]) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = rq_entry_fifo(dq->fifo_list[data_dir].next); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = dq->next_rq[data_dir]; } - dd->batching = 0; + dq->batching = 0; dispatch_request: /* * rq is the selected appropriate request. */ - dd->batching++; + dq->batching++; deadline_move_request(dd, rq); return 1; } -static int deadline_queue_empty(struct request_queue *q) +static void *deadline_alloc_deadline_queue(struct request_queue *q, + struct elevator_queue *eq, gfp_t gfp_mask, struct io_queue *ioq) { - struct deadline_data *dd = q->elevator->elevator_data; + struct deadline_queue *dq; - return list_empty(&dd->fifo_list[WRITE]) - && list_empty(&dd->fifo_list[READ]); + dq = kmalloc_node(sizeof(*dq), gfp_mask | __GFP_ZERO, q->node); + if (dq == NULL) + goto out; + + INIT_LIST_HEAD(&dq->fifo_list[READ]); + INIT_LIST_HEAD(&dq->fifo_list[WRITE]); + dq->sort_list[READ] = RB_ROOT; + dq->sort_list[WRITE] = RB_ROOT; +out: + return dq; +} + +static void deadline_free_deadline_queue(struct elevator_queue *e, + void *sched_queue) +{ + struct deadline_queue *dq = sched_queue; + + kfree(dq); } static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); - kfree(dd); } /* * initialize elevator private data (deadline_data). 
*/ -static void *deadline_init_queue(struct request_queue *q) +static void * +deadline_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct deadline_data *dd; @@ -355,10 +385,7 @@ static void *deadline_init_queue(struct request_queue *q) if (!dd) return NULL; - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; + dd->q = q; dd->fifo_expire[READ] = read_expire; dd->fifo_expire[WRITE] = write_expire; dd->writes_starved = writes_starved; @@ -435,6 +462,11 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(fifo_batch), +#ifdef CONFIG_IOSCHED_DEADLINE_HIER + ELV_ATTR(fairness), + ELV_ATTR(slice_sync), + ELV_ATTR(group_idle), +#endif __ATTR_NULL }; @@ -445,13 +477,16 @@ static struct elevator_type iosched_deadline = { .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, .elevator_add_req_fn = deadline_add_request, - .elevator_queue_empty_fn = deadline_queue_empty, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, .elevator_init_fn = deadline_init_queue, .elevator_exit_fn = deadline_exit_queue, + .elevator_alloc_sched_queue_fn = deadline_alloc_deadline_queue, + .elevator_free_sched_queue_fn = deadline_free_deadline_queue, }, - +#ifdef CONFIG_IOSCHED_DEADLINE_HIER + .elevator_features = ELV_IOSCHED_NEED_FQ | ELV_IOSCHED_SINGLE_IOQ, +#endif .elevator_attrs = deadline_attrs, .elevator_name = "deadline", .elevator_owner = THIS_MODULE, diff --git a/block/elevator-fq.c b/block/elevator-fq.c new file mode 100644 index 0000000..b723c12 --- /dev/null +++ b/block/elevator-fq.c @@ -0,0 +1,3365 @@ +/* + * elevator fair queuing Layer. 
+ * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include +#include +#include +#include +#include +#include "elevator-fq.h" + +const int elv_slice_sync = HZ / 10; +int elv_slice_async = HZ / 25; +const int elv_slice_async_rq = 2; +int elv_group_idle = HZ / 125; +static struct kmem_cache *elv_ioq_pool; + +/* + * offset from end of service tree + */ +#define ELV_IDLE_DELAY (HZ / 5) +#define ELV_SLICE_SCALE (500) +#define ELV_SERVICE_SHIFT 20 +#define ELV_HW_QUEUE_MIN (5) +#define ELV_SERVICE_TREE_INIT ((struct io_service_tree) \ + { RB_ROOT, NULL, 0, NULL, 0}) + +#ifdef CONFIG_DEBUG_ELV_FAIR_QUEUING +#define elv_log_entity(entity, fmt, args...) \ +{ \ +{ \ + struct io_queue *ioq = ioq_of(entity); \ + struct io_group *iog = iog_of(entity); \ + \ + if (ioq) { \ + elv_log_ioq(ioq->efqd, ioq, fmt, ##args); \ + } else { \ + elv_log_iog((struct elv_fq_data *)iog->key, iog, fmt, ##args);\ + } \ +} \ +} +#else +#define elv_log_entity(entity, fmt, args...) 
+#endif + +static inline struct io_queue *ioq_of(struct io_entity *entity) +{ + if (entity->my_sd == NULL) + return container_of(entity, struct io_queue, entity); + return NULL; +} + +static inline int io_entity_class_rt(struct io_entity *entity) +{ + return entity->ioprio_class == IOPRIO_CLASS_RT; +} + +static inline int io_entity_class_idle(struct io_entity *entity) +{ + return entity->ioprio_class == IOPRIO_CLASS_IDLE; +} + +static inline s64 +entity_key(struct io_service_tree *st, struct io_entity *entity) +{ + return entity->vdisktime - st->min_vdisktime; +} + +static inline u64 +elv_delta(u64 service, unsigned int numerator_wt, unsigned int denominator_wt) +{ + if (numerator_wt != denominator_wt) { + service = service * numerator_wt; + do_div(service, denominator_wt); + } + + return service; +} + +static inline u64 elv_delta_fair(unsigned long delta, struct io_entity *entity) +{ + u64 d = delta << ELV_SERVICE_SHIFT; + + return elv_delta(d, IO_WEIGHT_DEFAULT, entity->weight); +} + +static inline int +elv_weight_slice(struct elv_fq_data *efqd, int sync, unsigned int weight) +{ + const int base_slice = efqd->elv_slice[sync]; + + WARN_ON(weight > IO_WEIGHT_MAX); + + return elv_delta(base_slice, weight, IO_WEIGHT_DEFAULT); +} + +static inline int +elv_prio_to_slice(struct elv_fq_data *efqd, struct io_queue *ioq) +{ + return elv_weight_slice(efqd, elv_ioq_sync(ioq), ioq->entity.weight); +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct io_service_tree *st) +{ + u64 vdisktime = st->min_vdisktime; + + if (st->active_entity) + vdisktime = st->active_entity->vdisktime; + + if 
(st->rb_leftmost) { + struct io_entity *entity = rb_entry(st->rb_leftmost, + struct io_entity, rb_node); + + if (!st->active_entity) + vdisktime = entity->vdisktime; + else + vdisktime = min_vdisktime(vdisktime, entity->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + +static inline struct io_entity *parent_entity(struct io_entity *entity) +{ + return entity->parent; +} + +static inline struct io_group *iog_of(struct io_entity *entity) +{ + if (entity->my_sd) + return container_of(entity, struct io_group, entity); + return NULL; +} + +#ifdef CONFIG_GROUP_IOSCHED +/* check for entity->parent so that loop is not executed for root entity. */ +#define for_each_entity(entity) \ + for (; entity && entity->parent; entity = entity->parent) + +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct io_entity *entity, struct io_entity *new_entity) +{ + if (parent_entity(entity) == parent_entity(new_entity)) + return 1; + + return 0; +} + +/* return depth at which a io entity is present in the hierarchy */ +static inline int depth_entity(struct io_entity *entity) +{ + int depth = 0; + + for_each_entity(entity) + depth++; + + return depth; +} + +static void find_matching_io_entity(struct io_entity **entity, + struct io_entity **new_entity) +{ + int entity_depth, new_entity_depth; + + /* + * preemption test can be made between sibling entities who are in the + * same group i.e who have a common parent. Walk up the hierarchy of + * both entities until we find their ancestors who are siblings of + * common parent. 
+ */ + + /* First walk up until both entities are at same depth */ + entity_depth = depth_entity(*entity); + new_entity_depth = depth_entity(*new_entity); + + while (entity_depth > new_entity_depth) { + entity_depth--; + *entity = parent_entity(*entity); + } + + while (new_entity_depth > entity_depth) { + new_entity_depth--; + *new_entity = parent_entity(*new_entity); + } + + while (!is_same_group(*entity, *new_entity)) { + *entity = parent_entity(*entity); + *new_entity = parent_entity(*new_entity); + } +} +struct io_group *ioq_to_io_group(struct io_queue *ioq) +{ + return iog_of(parent_entity(&ioq->entity)); +} +EXPORT_SYMBOL(ioq_to_io_group); + +static inline struct io_sched_data * +io_entity_sched_data(struct io_entity *entity) +{ + return &iog_of(parent_entity(entity))->sched_data; +} + +#else /* GROUP_IOSCHED */ +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +static void find_matching_io_entity(struct io_entity **entity, + struct io_entity **new_entity) { } + +static inline int +is_same_group(struct io_entity *entity, struct io_entity *new_entity) +{ + return 1; +} + +static inline struct elv_fq_data *efqd_of(struct io_entity *entity) +{ + return ioq_of(entity)->efqd; +} + +struct io_group *ioq_to_io_group(struct io_queue *ioq) +{ + return ioq->efqd->root_group; +} +EXPORT_SYMBOL(ioq_to_io_group); + +static inline struct io_sched_data * +io_entity_sched_data(struct io_entity *entity) +{ + struct elv_fq_data *efqd = efqd_of(entity); + + return &efqd->root_group->sched_data; +} +#endif /* GROUP_IOSCHED */ + +static inline void +init_io_entity_service_tree(struct io_entity *entity, struct io_entity *parent) +{ + struct io_group *parent_iog = iog_of(parent); + unsigned short idx = entity->ioprio_class - 1; + + BUG_ON(idx >= IO_IOPRIO_CLASSES); + + entity->st = &parent_iog->sched_data.service_tree[idx]; +} + +/* + * Returns the number of active entities a particular io group has. 
This + * includes number of active entities on service trees as well as the active + * entity which is being served currently, if any. + */ + +static inline int elv_iog_nr_active(struct io_group *iog) +{ + return iog->sched_data.nr_active; +} + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +static void io_group_path(struct io_group *iog) +{ + unsigned short id = iog->iocg_id; + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (!id) + goto out; + + css = css_lookup(&io_subsys, id); + if (!css) + goto out; + + if (!css_tryget(css)) + goto out; + + cgroup_path(css->cgroup, iog->path, sizeof(iog->path)); + + css_put(css); + + rcu_read_unlock(); + return; +out: + rcu_read_unlock(); + iog->path[0] = '\0'; + return; +} + +static inline void debug_update_stats_enqueue(struct io_entity *entity) +{ + struct io_group *iog = iog_of(entity); + + if (iog) { + struct elv_fq_data *efqd; + + /* + * Keep track of how many times a group has been added + * to active tree. + */ + iog->queue++; + + rcu_read_lock(); + efqd = rcu_dereference(iog->key); + if (efqd) + elv_log_iog(efqd, iog, "add group weight=%u", + iog->entity.weight); + rcu_read_unlock(); + } +} + +static inline void debug_update_stats_dequeue(struct io_entity *entity) +{ + struct io_group *iog = iog_of(entity); + + if (iog) { + struct elv_fq_data *efqd; + + iog->dequeue++; + rcu_read_lock(); + efqd = rcu_dereference(iog->key); + if (efqd) + elv_log_iog(efqd, iog, "del group weight=%u", + iog->entity.weight); + rcu_read_unlock(); + } +} + +static inline void print_ioq_service_stats(struct io_queue *ioq) +{ + struct io_group *iog = ioq_to_io_group(ioq); + + elv_log_ioq(ioq->efqd, ioq, "service: QTt=%lu QTs=%lu GTt=%lu GTs=%lu", + ioq->entity.total_time, ioq->entity.total_sectors, + iog->entity.total_time, iog->entity.total_sectors); +} + +#else /* DEBUG_GROUP_IOSCHED */ +static inline void io_group_path(struct io_group *iog) {} +static inline void print_ioq_service_stats(struct io_queue *ioq) {} +static inline void 
debug_update_stats_enqueue(struct io_entity *entity) {} +static inline void debug_update_stats_dequeue(struct io_entity *entity) {} +#endif /* DEBUG_GROUP_IOSCHED */ + +#ifdef CONFIG_DEBUG_ELV_FAIR_QUEUING +static inline void debug_entity_vdisktime(struct io_entity *entity, + unsigned long served, u64 delta) +{ + struct elv_fq_data *efqd; + struct io_group *iog; + + elv_log_entity(entity, "vdisktime=%llu service=%lu delta=%llu" + " entity->weight=%u", entity->vdisktime, + served, delta, entity->weight); + + iog = iog_of(parent_entity(entity)); + efqd = iog->key; + elv_log_iog(efqd, iog, "min_vdisktime=%llu", entity->st->min_vdisktime); +} +#else /* DEBUG_ELV_FAIR_QUEUING */ +static inline void debug_entity_vdisktime(struct io_entity *entity, + unsigned long served, u64 delta) {} +#endif /* DEBUG_ELV_FAIR_QUEUING */ + +static void +entity_served(struct io_entity *entity, unsigned long served, + unsigned long nr_sectors) +{ + for_each_entity(entity) { + u64 delta; + + delta = elv_delta_fair(served, entity); + entity->vdisktime += delta; + update_min_vdisktime(entity->st); + entity->total_time += served; + entity->total_sectors += nr_sectors; + debug_entity_vdisktime(entity, served, delta); + } +} + +static void place_entity(struct io_service_tree *st, struct io_entity *entity, + int add_front) +{ + u64 vdisktime = st->min_vdisktime; + struct rb_node *parent; + struct io_entity *entry; + int nr_active = st->nr_active - 1; + + /* + * Currently put entity at the end of last entity. 
This probably will + * require adjustments as we move along + */ + if (io_entity_class_idle(entity)) { + vdisktime = elv_delta_fair(ELV_IDLE_DELAY, entity); + parent = rb_last(&st->active); + if (parent) { + entry = rb_entry(parent, struct io_entity, rb_node); + vdisktime += entry->vdisktime; + } + } else if (!add_front && nr_active) { + parent = rb_last(&st->active); + if (parent) { + entry = rb_entry(parent, struct io_entity, rb_node); + vdisktime = entry->vdisktime; + } + } else + vdisktime = st->min_vdisktime; + + entity->vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); + elv_log_entity(entity, "place_entity: vdisktime=%llu" + " min_vdisktime=%llu", entity->vdisktime, + st->min_vdisktime); +} + +static inline void io_entity_update_prio(struct io_entity *entity) +{ + if (unlikely(entity->ioprio_changed)) { + /* + * Re-initialize the service tree as ioprio class of the + * entity might have changed. + */ + init_io_entity_service_tree(entity, parent_entity(entity)); + entity->ioprio_changed = 0; + } +} + +static void +__dequeue_io_entity(struct io_service_tree *st, struct io_entity *entity) +{ + /* + * This can happen when during put_prev_io_entity, we detect that ioprio + * of the queue has changed and decide to dequeue_entity() and requeue + * back. In this case entity is on service tree but has already been + * removed from rb tree. 
+ */ + if (RB_EMPTY_NODE(&entity->rb_node)) + return; + + if (st->rb_leftmost == &entity->rb_node) { + struct rb_node *next_node; + + next_node = rb_next(&entity->rb_node); + st->rb_leftmost = next_node; + } + + rb_erase(&entity->rb_node, &st->active); + RB_CLEAR_NODE(&entity->rb_node); +} + +static void dequeue_io_entity(struct io_entity *entity) +{ + struct io_service_tree *st = entity->st; + struct io_sched_data *sd = io_entity_sched_data(entity); + + __dequeue_io_entity(st, entity); + entity->on_st = 0; + st->nr_active--; + sd->nr_active--; + debug_update_stats_dequeue(entity); +} + +static void +__enqueue_io_entity(struct io_service_tree *st, struct io_entity *entity, + int add_front) +{ + struct rb_node **node = &st->active.rb_node; + struct rb_node *parent = NULL; + struct io_entity *entry; + s64 key = entity_key(st, entity); + int leftmost = 1; + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct io_entity, rb_node); + + if (key < entity_key(st, entry) || + (add_front && (key == entity_key(st, entry)))) { + node = &parent->rb_left; + } else { + node = &parent->rb_right; + leftmost = 0; + } + } + + /* + * Maintain a cache of leftmost tree entries (it is frequently + * used) + */ + if (leftmost) + st->rb_leftmost = &entity->rb_node; + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, &st->active); +} + +static void enqueue_io_entity(struct io_entity *entity) +{ + struct io_service_tree *st; + struct io_sched_data *sd = io_entity_sched_data(entity); + + io_entity_update_prio(entity); + st = entity->st; + st->nr_active++; + sd->nr_active++; + entity->on_st = 1; + place_entity(st, entity, 0); + __enqueue_io_entity(st, entity, 0); + debug_update_stats_enqueue(entity); +} + +static struct io_entity *__lookup_next_io_entity(struct io_service_tree *st) +{ + struct rb_node *left = st->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct io_entity, rb_node); +} + +static struct 
io_entity *lookup_next_io_entity(struct io_sched_data *sd) +{ + struct io_service_tree *st = sd->service_tree; + struct io_entity *entity = NULL; + int i; + + BUG_ON(sd->active_entity != NULL); + + if (!sd->nr_active) + return NULL; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) { + entity = __lookup_next_io_entity(st); + if (entity) { + __dequeue_io_entity(st, entity); + st->active_entity = entity; + sd->active_entity = entity; + update_min_vdisktime(entity->st); + break; + } + } + + return entity; +} + +static void requeue_io_entity(struct io_entity *entity, int add_front) +{ + struct io_service_tree *st = entity->st; + struct io_entity *next_entity; + + if (add_front) { + next_entity = __lookup_next_io_entity(st); + + /* + * This is to emulate cfq-like functionality where preemption + * can happen within the same class, like sync queue preempting + * async queue. + * + * This feature is also used by cfq close cooperator + * functionality where cfq selects a queue out of order to run + * next based on close cooperator. + */ + + if (next_entity && next_entity == entity) + return; + } + + __dequeue_io_entity(st, entity); + place_entity(st, entity, add_front); + __enqueue_io_entity(st, entity, add_front); +} + +/* Requeue an ioq which is already on the tree */ +static void requeue_ioq(struct io_queue *ioq, int add_front) +{ + requeue_io_entity(&ioq->entity, add_front); +} + +static void put_prev_io_entity(struct io_entity *entity) +{ + struct io_service_tree *st = entity->st; + struct io_sched_data *sd = io_entity_sched_data(entity); + + st->active_entity = NULL; + sd->active_entity = NULL; + + if (unlikely(entity->ioprio_changed)) { + dequeue_io_entity(entity); + enqueue_io_entity(entity); + } else + __enqueue_io_entity(st, entity, 0); +} + +/* Put curr ioq back into rb tree.
*/ +static void put_prev_ioq(struct io_queue *ioq) +{ + struct io_entity *entity = &ioq->entity; + + for_each_entity(entity) { + put_prev_io_entity(entity); + } +} + +static void dequeue_ioq(struct io_queue *ioq) +{ + struct io_entity *entity = &ioq->entity; + + for_each_entity(entity) { + struct io_sched_data *sd = io_entity_sched_data(entity); + + dequeue_io_entity(entity); + /* Don't dequeue parent if it has other entities besides us */ + if (sd->nr_active) + break; + } + elv_put_ioq(ioq); + return; +} + +/* Put a new queue on to the tree */ +static void enqueue_ioq(struct io_queue *ioq) +{ + struct io_entity *entity = &ioq->entity; + + elv_get_ioq(ioq); + + for_each_entity(entity) { + if (entity->on_st) + break; + enqueue_io_entity(entity); + } +} + +static inline void +init_io_entity_parent(struct io_entity *entity, struct io_entity *parent) +{ + entity->parent = parent; + init_io_entity_service_tree(entity, parent); +} + +void elv_put_ioq(struct io_queue *ioq) +{ + struct elv_fq_data *efqd = ioq->efqd; + struct elevator_queue *e = efqd->eq; + struct io_group *iog; + + BUG_ON(atomic_read(&ioq->ref) <= 0); + if (!atomic_dec_and_test(&ioq->ref)) + return; + BUG_ON(ioq->nr_queued); + BUG_ON(elv_ioq_busy(ioq)); + BUG_ON(efqd->active_queue == ioq); + iog = ioq_to_io_group(ioq); + + /* Can be called by outgoing elevator. 
Don't use q */ + BUG_ON(!e->ops->elevator_free_sched_queue_fn); + e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue); + elv_log_ioq(efqd, ioq, "put_queue"); + elv_free_ioq(ioq); + elv_put_iog(iog); +} +EXPORT_SYMBOL(elv_put_ioq); + +static void elv_ioq_served(struct io_queue *ioq, unsigned long served) +{ + entity_served(&ioq->entity, served, ioq->nr_sectors); + elv_log_ioq(ioq->efqd, ioq, "ioq served: QSt=%lu QSs=%lu qued=%lu", + served, ioq->nr_sectors, ioq->nr_queued); + print_ioq_service_stats(ioq); +} + +/* + * sysfs parts below (tunables are unsigned int, shown with %u) --> + */ +static ssize_t +elv_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%u\n", var); +} + +static ssize_t +elv_var_store(unsigned int *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtoul(p, &p, 10); + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct elv_fq_data *efqd = e->efqd; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return elv_var_show(__data, (page)); \ +} +SHOW_FUNCTION(elv_group_idle_show, efqd->elv_group_idle, 1); +EXPORT_SYMBOL(elv_group_idle_show); +SHOW_FUNCTION(elv_slice_sync_show, efqd->elv_slice[1], 1); +EXPORT_SYMBOL(elv_slice_sync_show); +SHOW_FUNCTION(elv_slice_async_show, efqd->elv_slice[0], 1); +EXPORT_SYMBOL(elv_slice_async_show); +SHOW_FUNCTION(elv_fairness_show, efqd->fairness, 0); +EXPORT_SYMBOL(elv_fairness_show); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct elv_fq_data *efqd = e->efqd; \ + unsigned int __data; \ + int ret = elv_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +}
+STORE_FUNCTION(elv_group_idle_store, &efqd->elv_group_idle, 0, UINT_MAX, 1); +EXPORT_SYMBOL(elv_group_idle_store); +STORE_FUNCTION(elv_slice_sync_store, &efqd->elv_slice[1], 1, UINT_MAX, 1); +EXPORT_SYMBOL(elv_slice_sync_store); +STORE_FUNCTION(elv_slice_async_store, &efqd->elv_slice[0], 1, UINT_MAX, 1); +EXPORT_SYMBOL(elv_slice_async_store); +STORE_FUNCTION(elv_fairness_store, &efqd->fairness, 0, 1, 0); +EXPORT_SYMBOL(elv_fairness_store); +#undef STORE_FUNCTION + +void elv_schedule_dispatch(struct request_queue *q) +{ + struct elv_fq_data *efqd = q->elevator->efqd; + + if (elv_nr_busy_ioq(q->elevator)) { + elv_log(efqd, "schedule dispatch"); + kblockd_schedule_work(q, &efqd->unplug_work); + } +} +EXPORT_SYMBOL(elv_schedule_dispatch); + +static void elv_kick_queue(struct work_struct *work) +{ + struct elv_fq_data *efqd = + container_of(work, struct elv_fq_data, unplug_work); + struct request_queue *q = efqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +static void elv_shutdown_timer_wq(struct elevator_queue *e) +{ + del_timer_sync(&e->efqd->idle_slice_timer); + cancel_work_sync(&e->efqd->unplug_work); +} + +static void elv_set_prio_slice(struct elv_fq_data *efqd, struct io_queue *ioq) +{ + ioq->slice_start = jiffies; + ioq->slice_end = elv_prio_to_slice(efqd, ioq) + jiffies; + elv_log_ioq(efqd, ioq, "set_slice=%lu", ioq->slice_end - jiffies); +} + +struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask) +{ + struct io_queue *ioq = NULL; + + ioq = kmem_cache_alloc_node(elv_ioq_pool, gfp_mask, q->node); + return ioq; +} +EXPORT_SYMBOL(elv_alloc_ioq); + +void elv_free_ioq(struct io_queue *ioq) +{ + kmem_cache_free(elv_ioq_pool, ioq); +} +EXPORT_SYMBOL(elv_free_ioq); + +int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq, pid_t pid, + int is_sync) +{ + RB_CLEAR_NODE(&ioq->entity.rb_node); + atomic_set(&ioq->ref, 0); + ioq->efqd = eq->efqd; + if (elv_iosched_single_ioq(eq)) + 
ioq->pid = 0; + else + ioq->pid = current->pid; + + elv_ioq_set_ioprio_class(ioq, IOPRIO_CLASS_BE); + elv_ioq_set_ioprio(ioq, IOPRIO_NORM); + + return 0; +} +EXPORT_SYMBOL(elv_init_ioq); + +static void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr) +{ + struct io_queue *ioq = *ioq_ptr; + + if (ioq != NULL) { + /* Drop the reference taken by the io group */ + elv_put_ioq(ioq); + *ioq_ptr = NULL; + } +} + +/* + * Release all the io group references to its async queues. + */ +static void +put_io_group_queues(struct elevator_queue *e, struct io_group *iog) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + elv_release_ioq(e, &iog->async_queue[i][j]); + + /* Free up async idle queue */ + elv_release_ioq(e, &iog->async_idle_queue); + +#ifdef CONFIG_GROUP_IOSCHED + /* Optimization for io schedulers having single ioq */ + if (elv_iosched_single_ioq(e)) + elv_release_ioq(e, &iog->ioq); +#endif +} + +void *elv_io_group_async_queue_prio(struct io_group *iog, int ioprio_class, + int ioprio) +{ + struct io_queue *ioq = NULL; + + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + ioq = iog->async_queue[0][ioprio]; + break; + case IOPRIO_CLASS_BE: + ioq = iog->async_queue[1][ioprio]; + break; + case IOPRIO_CLASS_IDLE: + ioq = iog->async_idle_queue; + break; + default: + BUG(); + } + + if (ioq) + return ioq->sched_queue; + return NULL; +} +EXPORT_SYMBOL(elv_io_group_async_queue_prio); + +void elv_io_group_set_async_queue(struct io_group *iog, int ioprio_class, + int ioprio, struct io_queue *ioq) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + iog->async_queue[0][ioprio] = ioq; + break; + case IOPRIO_CLASS_BE: + iog->async_queue[1][ioprio] = ioq; + break; + case IOPRIO_CLASS_IDLE: + iog->async_idle_queue = ioq; + break; + default: + BUG(); + } + + /* + * Take the group reference and pin the queue. 
Group exit will + * clean it up + */ + elv_get_ioq(ioq); +} +EXPORT_SYMBOL(elv_io_group_set_async_queue); + +#ifdef CONFIG_GROUP_IOSCHED +static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup); + +static struct io_policy_node *policy_search_node(const struct io_cgroup *iocg, + dev_t dev); +static void +io_group_init_entity(struct io_cgroup *iocg, struct io_group *iog, dev_t dev) +{ + struct io_entity *entity = &iog->entity; + struct io_policy_node *pn; + unsigned long flags; + + spin_lock_irqsave(&iocg->lock, flags); + pn = policy_search_node(iocg, dev); + if (pn) { + entity->weight = pn->weight; + entity->ioprio_class = pn->ioprio_class; + } else { + entity->weight = iocg->weight; + entity->ioprio_class = iocg->ioprio_class; + } + spin_unlock_irqrestore(&iocg->lock, flags); + + entity->ioprio_changed = 1; + entity->my_sd = &iog->sched_data; +} + +/* Check if we plan to idle on the group associated with this queue or not */ +int elv_iog_should_idle(struct io_queue *ioq) +{ + struct io_group *iog = ioq_to_io_group(ioq); + struct elv_fq_data *efqd = ioq->efqd; + + /* + * No idling on group if group idle is disabled or idling is disabled + * for this group. Currently for root group idling is disabled. + */ + if (!efqd->elv_group_idle || !elv_iog_idle_window(iog)) + return 0; + + /* + * If this is the last active queue in group with no request queued, we + * need to idle on group before expiring the queue to make sure group + * does not lose its share. + */ + if ((elv_iog_nr_active(iog) <= 1) && !ioq->nr_queued) + return 1; + + return 0; +} +EXPORT_SYMBOL(elv_iog_should_idle); + +static void io_group_set_parent(struct io_group *iog, struct io_group *parent) +{ + struct io_entity *entity = &iog->entity; + + init_io_entity_parent(entity, &parent->entity); + + /* Child group reference on parent group.
*/ + elv_get_iog(parent); +} + +struct io_cgroup io_root_cgroup = { + .weight = IO_WEIGHT_DEFAULT, + .ioprio_class = IOPRIO_CLASS_BE, +}; + +static struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup) +{ + if (!cgroup) + return &io_root_cgroup; + + return container_of(cgroup_subsys_state(cgroup, io_subsys_id), + struct io_cgroup, css); +} + +struct request_list * +elv_get_request_list_bio(struct request_queue *q, struct bio *bio) +{ + struct io_group *iog; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + iog = q->elevator->efqd->root_group; + else + iog = elv_io_get_io_group_bio(q, bio, 1); + + BUG_ON(!iog); + return &iog->rl; +} + +struct request_list * +elv_get_request_list_rq(struct request_queue *q, struct request *rq, int priv) +{ + struct io_group *iog; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return &q->elevator->efqd->root_group->rl; + + BUG_ON(priv && !rq->ioq); + + if (priv) + iog = ioq_to_io_group(rq->ioq); + else + iog = q->elevator->efqd->root_group; + + BUG_ON(!iog); + return &iog->rl; +} + +/* Set io group congestion on and off thresholds */ +void elv_io_group_congestion_threshold(struct request_queue *q, + struct io_group *iog) +{ + int nr; + + nr = q->nr_group_requests - (q->nr_group_requests / 8) + 1; + if (nr > q->nr_group_requests) + nr = q->nr_group_requests; + iog->nr_congestion_on = nr; + + nr = q->nr_group_requests - (q->nr_group_requests / 8) + - (q->nr_group_requests / 16) - 1; + if (nr < 1) + nr = 1; + iog->nr_congestion_off = nr; +} + +void elv_clear_iog_congested(struct io_group *iog, int sync) +{ + enum io_group_state bit; + + bit = sync ? IOG_sync_congested : IOG_async_congested; + clear_bit(bit, &iog->state); + smp_mb__after_clear_bit(); + congestion_wake_up(sync); +} + +void elv_set_iog_congested(struct io_group *iog, int sync) +{ + enum io_group_state bit; + + bit = sync ? 
IOG_sync_congested : IOG_async_congested; + set_bit(bit, &iog->state); +} + +static inline int elv_iog_congested(struct io_group *iog, int iog_bits) +{ + return iog->state & iog_bits; +} + +/* Determine if io group page maps to is congested or not */ +int elv_page_io_group_congested(struct request_queue *q, struct page *page, + int sync) +{ + struct io_group *iog; + int ret = 0; + + rcu_read_lock(); + + iog = elv_io_get_io_group(q, page, 0); + + if (!iog) { + /* + * Either cgroup got deleted or this is first request in the + * group and associated io group object has not been created + * yet. Map it to root group. + * + * TODO: Fix the case of group not created yet. + */ + iog = q->elevator->efqd->root_group; + } + + if (sync) + ret = elv_iog_congested(iog, 1 << IOG_sync_congested); + else + ret = elv_iog_congested(iog, 1 << IOG_async_congested); + + if (ret) + elv_log_iog(q->elevator->efqd, iog, "iog congested=%d sync=%d" + " rl.count[sync]=%d nr_group_requests=%d", + ret, sync, iog->rl.count[sync], q->nr_group_requests); + rcu_read_unlock(); + return ret; +} + +static inline int +elv_iog_congestion_on_threshold(struct io_group *iog) +{ + return iog->nr_congestion_on; +} + +static inline int +elv_iog_congestion_off_threshold(struct io_group *iog) +{ + return iog->nr_congestion_off; +} + +void elv_freed_request(struct request_list *rl, int sync) +{ + struct io_group *iog = rl_iog(rl); + + if (iog->rl.count[sync] < elv_iog_congestion_off_threshold(iog)) + elv_clear_iog_congested(iog, sync); +} + +void elv_get_request(struct request_list *rl, int sync) +{ + struct io_group *iog = rl_iog(rl); + + if (iog->rl.count[sync]+1 >= elv_iog_congestion_on_threshold(iog)) + elv_set_iog_congested(iog, sync); +} + +static void iog_nr_requests_updated(struct io_group *iog) +{ + if (iog->rl.count[BLK_RW_SYNC] >= elv_iog_congestion_on_threshold(iog)) + elv_set_iog_congested(iog, BLK_RW_SYNC); + else if (iog->rl.count[BLK_RW_SYNC] < + elv_iog_congestion_off_threshold(iog)) + 
elv_clear_iog_congested(iog, BLK_RW_SYNC); + + if (iog->rl.count[BLK_RW_ASYNC] >= elv_iog_congestion_on_threshold(iog)) + elv_set_iog_congested(iog, BLK_RW_ASYNC); + else if (iog->rl.count[BLK_RW_ASYNC] < + elv_iog_congestion_off_threshold(iog)) + elv_clear_iog_congested(iog, BLK_RW_ASYNC); +} + +void elv_updated_nr_group_requests(struct request_queue *q) +{ + struct elv_fq_data *efqd; + struct hlist_node *n; + struct io_group *iog; + + efqd = q->elevator->efqd; + + hlist_for_each_entry(iog, n, &efqd->group_list, elv_data_node) { + elv_io_group_congestion_threshold(q, iog); + iog_nr_requests_updated(iog); + } +} + +/* + * Search the io_group for efqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). + */ +static struct io_group * +io_cgroup_lookup_group(struct io_cgroup *iocg, void *key) +{ + struct io_group *iog; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + __key = rcu_dereference(iog->key); + if (__key == key) + return iog; + } + + return NULL; +} + +static int io_cgroup_policy_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct io_cgroup *iocg; + struct io_policy_node *pn; + + iocg = cgroup_to_io_cgroup(cgrp); + + if (list_empty(&iocg->policy_list)) + goto out; + + seq_printf(m, "dev\tweight\tclass\n"); + + spin_lock_irq(&iocg->lock); + list_for_each_entry(pn, &iocg->policy_list, node) { + seq_printf(m, "%u:%u\t%u\t%hu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight, pn->ioprio_class); + } + spin_unlock_irq(&iocg->lock); +out: + return 0; +} + +static inline void policy_insert_node(struct io_cgroup *iocg, + struct io_policy_node *pn) +{ + list_add(&pn->node, &iocg->policy_list); +} + +/* Must be called with iocg->lock held */ +static inline void policy_delete_node(struct io_policy_node *pn) +{ + list_del(&pn->node); +} + +/* Must be called with iocg->lock held */ +static struct io_policy_node *policy_search_node(const struct 
io_cgroup *iocg, + dev_t dev) +{ + struct io_policy_node *pn; + + if (list_empty(&iocg->policy_list)) + return NULL; + + list_for_each_entry(pn, &iocg->policy_list, node) { + if (pn->dev == dev) + return pn; + } + + return NULL; +} + +static int check_dev_num(dev_t dev) +{ + int part = 0; + struct gendisk *disk; + + disk = get_gendisk(dev, &part); + if (!disk || part) + return -ENODEV; + + return 0; +} + +static int policy_parse_and_set(char *buf, struct io_policy_node *newpn) +{ + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + int ret; + unsigned long major, minor, temp; + int i = 0; + dev_t dev; + + memset(s, 0, sizeof(s)); + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; + s[i++] = p; + + /* Prevent from inputing too many things */ + if (i == 4) + break; + } + + if (i != 3) + return -EINVAL; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + return -EINVAL; + + minor_s = s[0]; + if (!minor_s) + return -EINVAL; + + ret = strict_strtoul(major_s, 10, &major); + if (ret) + return -EINVAL; + + ret = strict_strtoul(minor_s, 10, &minor); + if (ret) + return -EINVAL; + + dev = MKDEV(major, minor); + + ret = check_dev_num(dev); + if (ret) + return ret; + + newpn->dev = dev; + + if (s[1] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[1], 10, &temp); + if (ret || temp > IO_WEIGHT_MAX) + return -EINVAL; + + newpn->weight = temp; + + if (s[2] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[2], 10, &temp); + if (ret || temp < IOPRIO_CLASS_RT || temp > IOPRIO_CLASS_IDLE) + return -EINVAL; + newpn->ioprio_class = temp; + + return 0; +} + +static void update_iog_weight_prio(struct io_group *iog, struct io_cgroup *iocg, + struct io_policy_node *pn) +{ + if (pn->weight) { + iog->entity.weight = pn->weight; + iog->entity.ioprio_class = pn->ioprio_class; + /* + * iog weight and ioprio_class updating actually happens if + * ioprio_changed is set. 
So ensure ioprio_changed is not set + * until new weight and new ioprio_class are updated. + */ + smp_wmb(); + iog->entity.ioprio_changed = 1; + } else { + iog->entity.weight = iocg->weight; + iog->entity.ioprio_class = iocg->ioprio_class; + + /* The same as above */ + smp_wmb(); + iog->entity.ioprio_changed = 1; + } +} + +/* + * Write handler for the cgroup "policy" file. The buffer is parsed by + * policy_parse_and_set() as "major:minor weight ioprio_class". A weight of 0 + * removes the per-device rule for that device; any other weight inserts a new + * rule or updates the existing one. io groups of this cgroup on that device + * are then refreshed. Returns 0 on success or a negative errno. + */ +static int io_cgroup_policy_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct io_cgroup *iocg; + struct io_policy_node *newpn, *pn; + char *buf; + int ret = 0; + int keep_newpn = 0; + struct hlist_node *n; + struct io_group *iog; + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } + + ret = policy_parse_and_set(buf, newpn); + if (ret) + goto free_newpn; + + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto free_newpn; + } + + iocg = cgroup_to_io_cgroup(cgrp); + spin_lock_irq(&iocg->lock); + + pn = policy_search_node(iocg, newpn->dev); + if (!pn) { + if (newpn->weight != 0) { + /* newpn now belongs to the policy list */ + policy_insert_node(iocg, newpn); + keep_newpn = 1; + } + goto update_io_group; + } + + if (newpn->weight == 0) { + /* + * weight == 0 means deleting a policy. The node is unlinked + * under iocg->lock, which the lookups take, so free it here; + * otherwise it would be leaked. + */ + policy_delete_node(pn); + kfree(pn); + goto update_io_group; + } + + pn->weight = newpn->weight; + pn->ioprio_class = newpn->ioprio_class; + +update_io_group: + hlist_for_each_entry(iog, n, &iocg->group_data, group_node) { + if (iog->dev == newpn->dev) + update_iog_weight_prio(iog, iocg, newpn); + } + spin_unlock_irq(&iocg->lock); + + cgroup_unlock(); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct io_cgroup *iocg; \ + u64 ret; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + iocg = cgroup_to_io_cgroup(cgroup); \ + spin_lock_irq(&iocg->lock); \ + ret = iocg->__VAR; \ + 
spin_unlock_irq(&iocg->lock); \ + \ + cgroup_unlock(); \ + \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int io_cgroup_##__VAR##_write(struct cgroup *cgroup, \ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct io_cgroup *iocg; \ + struct io_group *iog; \ + struct hlist_node *n; \ + struct io_policy_node *pn; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return -EINVAL; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + iocg = cgroup_to_io_cgroup(cgroup); \ + \ + spin_lock_irq(&iocg->lock); \ + iocg->__VAR = (unsigned long)val; \ + hlist_for_each_entry(iog, n, &iocg->group_data, group_node) { \ + pn = policy_search_node(iocg, iog->dev); \ + if (pn) \ + continue; \ + iog->entity.__VAR = (unsigned long)val; \ + smp_wmb(); \ + iog->entity.ioprio_changed = 1; \ + } \ + spin_unlock_irq(&iocg->lock); \ + \ + cgroup_unlock(); \ + \ + return 0; \ +} + +STORE_FUNCTION(weight, IO_WEIGHT_MIN, IO_WEIGHT_MAX); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION + +static int io_cgroup_disk_time_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg; + struct io_group *iog; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + + rcu_read_lock(); + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. 
+ */ + if (iog->key) { + seq_printf(m, "%u:%u %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), + iog->entity.total_time); + } + } + rcu_read_unlock(); + cgroup_unlock(); + + return 0; +} + +static int io_cgroup_disk_sectors_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg; + struct io_group *iog; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + + rcu_read_lock(); + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. + */ + if (iog->key) { + seq_printf(m, "%u:%u %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), + iog->entity.total_sectors); + } + } + rcu_read_unlock(); + cgroup_unlock(); + + return 0; +} + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +static int io_cgroup_disk_queue_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg = NULL; + struct io_group *iog = NULL; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + rcu_read_lock(); + /* Loop through all the io groups and print statistics */ + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. 
+ */ + if (iog->key) + seq_printf(m, "%u:%u %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), iog->queue); + } + rcu_read_unlock(); + cgroup_unlock(); + + return 0; +} + +static int io_cgroup_disk_dequeue_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg = NULL; + struct io_group *iog = NULL; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + spin_lock_irq(&iocg->lock); + /* Loop through all the io groups and print statistics */ + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. + */ + if (iog->key) + seq_printf(m, "%u:%u %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), iog->dequeue); + } + spin_unlock_irq(&iocg->lock); + cgroup_unlock(); + + return 0; +} +#endif + +struct cftype io_files[] = { + { + .name = "policy", + .read_seq_string = io_cgroup_policy_read, + .write_string = io_cgroup_policy_write, + .max_write_len = 256, + }, + { + .name = "weight", + .read_u64 = io_cgroup_weight_read, + .write_u64 = io_cgroup_weight_write, + }, + { + .name = "ioprio_class", + .read_u64 = io_cgroup_ioprio_class_read, + .write_u64 = io_cgroup_ioprio_class_write, + }, + { + .name = "disk_time", + .read_seq_string = io_cgroup_disk_time_read, + }, + { + .name = "disk_sectors", + .read_seq_string = io_cgroup_disk_sectors_read, + }, +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + { + .name = "disk_queue", + .read_seq_string = io_cgroup_disk_queue_read, + }, + { + .name = "disk_dequeue", + .read_seq_string = io_cgroup_disk_dequeue_read, + }, +#endif +}; + +static int iocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, io_files, ARRAY_SIZE(io_files)); +} + +static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys, + struct cgroup *cgroup) +{ + struct io_cgroup *iocg; + + if 
(cgroup->parent != NULL) { + iocg = kzalloc(sizeof(*iocg), GFP_KERNEL); + if (iocg == NULL) + return ERR_PTR(-ENOMEM); + } else + iocg = &io_root_cgroup; + + spin_lock_init(&iocg->lock); + INIT_HLIST_HEAD(&iocg->group_data); + iocg->weight = IO_WEIGHT_DEFAULT; + iocg->ioprio_class = IOPRIO_CLASS_BE; + INIT_LIST_HEAD(&iocg->policy_list); + + return &iocg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int iocg_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct task_struct *tsk) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too young or + * exiting: if it has still no ioc the ioc can't be shared, + * if the task is exiting the attach will fail anyway, no + * matter what we return here. 
+ */ + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void iocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys io_subsys = { + .name = "io", + .create = iocg_create, + .can_attach = iocg_can_attach, + .attach = iocg_attach, + .destroy = iocg_destroy, + .populate = iocg_populate, + .subsys_id = io_subsys_id, + .use_id = 1, +}; + +static inline unsigned int iog_weight(struct io_group *iog) +{ + return iog->entity.weight; +} + +/* + * Allocate an io group for cgroup and for each ancestor that does not yet + * have one for this key, walking up until an existing group is found. The new + * groups are only chained to each other here; nothing is added to the cgroup + * or elevator lists. Returns the leaf of the new chain, or NULL if an + * allocation failed. + */ +static struct io_group * +io_group_chain_alloc(struct request_queue *q, void *key, struct cgroup *cgroup) +{ + struct io_cgroup *iocg; + struct io_group *iog, *leaf = NULL, *prev = NULL; + gfp_t flags = GFP_ATOMIC | __GFP_ZERO; + unsigned int major = 0, minor = 0; + struct backing_dev_info *bdi = &q->backing_dev_info; + + for (; cgroup != NULL; cgroup = cgroup->parent) { + iocg = cgroup_to_io_cgroup(cgroup); + + iog = io_cgroup_lookup_group(iocg, key); + if (iog != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a io_group for efqd, so we don't + * need any more allocations. + */ + break; + } + + iog = kzalloc_node(sizeof(*iog), flags, q->node); + if (!iog) + goto cleanup; + + iog->iocg_id = css_id(&iocg->css); + + /* + * The sscanf() result is not checked: major/minor keep their + * zero initializers if the bdi device name is not in + * "major:minor" form, instead of being read uninitialized. + */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + iog->dev = MKDEV(major, minor); + + io_group_init_entity(iocg, iog, iog->dev); + + atomic_set(&iog->ref, 0); + + elv_mark_iog_idle_window(iog); + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first.
+ */ + elv_get_iog(iog); + io_group_path(iog); + + blk_init_request_list(&iog->rl); + elv_io_group_congestion_threshold(q, iog); + + if (leaf == NULL) { + leaf = iog; + prev = leaf; + } else { + io_group_set_parent(prev, iog); + /* + * Build a list of allocated nodes using the efqd + * field, that is still unused and will be initialized + * only after the node will be connected. + */ + prev->key = iog; + prev = iog; + } + } + + return leaf; + +cleanup: + /* Free the partial chain; ->key links each new node to the next one */ + while (leaf != NULL) { + prev = leaf; + leaf = leaf->key; + kfree(prev); + } + + return NULL; +} + +/* + * Publish a chain of newly allocated groups, starting at leaf and walking up + * the cgroup hierarchy. Each node gets its real key and is added to its + * cgroup's group_data list and to the elevator's group_list; the topmost new + * node is then parented to the group already present above it. The queue + * lock must be held (asserted below). + */ +static void io_group_chain_link(struct request_queue *q, void *key, + struct cgroup *cgroup, struct io_group *leaf, + struct elv_fq_data *efqd) +{ + struct io_cgroup *iocg; + struct io_group *iog, *next, *prev = NULL; + unsigned long flags; + + assert_spin_locked(q->queue_lock); + + for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { + iocg = cgroup_to_io_cgroup(cgroup); + /* ->key still holds the temporary chain link; save it first */ + next = leaf->key; + + iog = io_cgroup_lookup_group(iocg, key); + BUG_ON(iog != NULL); + + spin_lock_irqsave(&iocg->lock, flags); + + rcu_assign_pointer(leaf->key, key); + hlist_add_head_rcu(&leaf->group_node, &iocg->group_data); + hlist_add_head(&leaf->elv_data_node, &efqd->group_list); + + spin_unlock_irqrestore(&iocg->lock, flags); + + prev = leaf; + leaf = next; + } + + BUG_ON(cgroup == NULL && leaf != NULL); + + /* + * This connects the topmost element of the allocated chain to the + * parent group. + */ + if (cgroup != NULL && prev != NULL) { + iocg = cgroup_to_io_cgroup(cgroup); + iog = io_cgroup_lookup_group(iocg, key); + io_group_set_parent(prev, iog); + } +} + +static struct io_group *io_find_alloc_group(struct request_queue *q, + struct cgroup *cgroup, struct elv_fq_data *efqd, + int create) +{ + struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup); + struct io_group *iog = NULL; + /* Note: Use efqd as key */ + void *key = efqd; + + /* + * Take a reference to css object.
Don't want to map a bio to + * a group if it has been marked for deletion + */ + + if (!css_tryget(&iocg->css)) + return iog; + + iog = io_cgroup_lookup_group(iocg, key); + if (iog != NULL || !create) + goto end; + + iog = io_group_chain_alloc(q, key, cgroup); + if (iog != NULL) + io_group_chain_link(q, key, cgroup, iog, efqd); + +end: + css_put(&iocg->css); + return iog; +} + +struct io_group *elv_io_get_io_group_bio(struct request_queue *q, + struct bio *bio, int create) +{ + struct page *page = NULL; + + /* + * Determine the group from task context. Even calls from + * blk_get_request() which don't have any bio info will be mapped + * to the task's group + */ + if (!bio) + goto sync; + + if (bio_barrier(bio)) { + /* + * Map barrier requests to root group. May be more special + * bio cases should come here + */ + return q->elevator->efqd->root_group; + } + +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + /* Map the sync bio to the right group using task context */ + if (elv_bio_sync(bio)) + goto sync; + + /* Determine the group from info stored in page */ + page = bio_iovec_idx(bio, 0)->bv_page; + return elv_io_get_io_group(q, page, create); +#endif + +sync: + return elv_io_get_io_group(q, page, create); +} +EXPORT_SYMBOL(elv_io_get_io_group_bio); + +/* + * Find the io group page belongs to. + * If "create" is set, io group is created if it is not already present. + * + * Note: This function should be called with queue lock held. It returns + * a pointer to io group without taking any reference. That group will + * be around as long as queue lock is not dropped (as group reclaim code + * needs to get hold of queue lock). So if somebody needs to use group + * pointer even after dropping queue lock, take a reference to the group + * before dropping queue lock. + * + * One can call it without queue lock with rcu read lock held for browsing + * through the groups. 
+ */ +struct io_group * +elv_io_get_io_group(struct request_queue *q, struct page *page, int create) +{ + struct cgroup *cgroup; + struct io_group *iog; + struct elv_fq_data *efqd = q->elevator->efqd; + + if (create) + assert_spin_locked(q->queue_lock); + + rcu_read_lock(); + + if (!page) + cgroup = task_cgroup(current, io_subsys_id); + else + cgroup = get_cgroup_from_page(page); + + if (!cgroup) { + iog = efqd->root_group; + goto out; + } + + iog = io_find_alloc_group(q, cgroup, efqd, create); + if (!iog) { + if (create) + iog = efqd->root_group; + else { + /* + * bio merge functions doing lookup don't want to + * map bio to root group by default + */ + iog = NULL; + } + } +out: + rcu_read_unlock(); + return iog; +} +EXPORT_SYMBOL(elv_io_get_io_group); + +static void io_free_root_group(struct elevator_queue *e) +{ + struct io_group *iog = e->efqd->root_group; + struct io_cgroup *iocg = &io_root_cgroup; + + spin_lock_irq(&iocg->lock); + hlist_del_rcu(&iog->group_node); + spin_unlock_irq(&iocg->lock); + + put_io_group_queues(e, iog); + elv_put_iog(iog); +} + +static struct io_group *io_alloc_root_group(struct request_queue *q, + struct elevator_queue *e, void *key) +{ + struct io_group *iog; + struct io_cgroup *iocg = &io_root_cgroup; + int i; + + iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node); + if (iog == NULL) + return NULL; + + elv_get_iog(iog); + iog->entity.parent = NULL; + iog->entity.my_sd = &iog->sched_data; + iog->key = key; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) + iog->sched_data.service_tree[i] = ELV_SERVICE_TREE_INIT; + + blk_init_request_list(&iog->rl); + elv_io_group_congestion_threshold(q, iog); + spin_lock_irq(&iocg->lock); + rcu_assign_pointer(iog->key, key); + hlist_add_head_rcu(&iog->group_node, &iocg->group_data); + iog->iocg_id = css_id(&iocg->css); + spin_unlock_irq(&iocg->lock); + io_group_path(iog); + + return iog; +} + +static void io_group_free_rcu(struct rcu_head *head) +{ + struct io_group *iog; + + iog = 
container_of(head, struct io_group, rcu_head); + kfree(iog); +} + +/* + * This cleanup function does the last bit of things to destroy cgroup. + * It should only get called after io_destroy_group has been invoked. + */ +static void io_group_cleanup(struct io_group *iog) +{ + struct io_service_tree *st; + int i; + + BUG_ON(iog->sched_data.active_entity != NULL); + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) { + st = iog->sched_data.service_tree + i; + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(st->active_entity != NULL); + } + + /* + * Wait for any rcu readers to exit before freeing up the group. + * Primarily useful when elv_io_get_io_group() is called without queue + * lock to access some group data from bdi_congested_group() path. + */ + call_rcu(&iog->rcu_head, io_group_free_rcu); +} + +void elv_put_iog(struct io_group *iog) +{ + struct io_group *parent_iog = NULL; + struct io_entity *parent; + + BUG_ON(atomic_read(&iog->ref) <= 0); + if (!atomic_dec_and_test(&iog->ref)) + return; + + parent = parent_entity(&iog->entity); + if (parent) + parent_iog = iog_of(parent); + + io_group_cleanup(iog); + + if (parent_iog) + elv_put_iog(parent_iog); +} +EXPORT_SYMBOL(elv_put_iog); + +/* + * After the group is destroyed, no new sync IO should come to the group. + * It might still have pending IOs in some busy queues. It should be able to + * send those IOs down to the disk. The async IOs (due to dirty page writeback) + * would go in the root group queues after this, as the group does not exist + * anymore. + */ +static void __io_destroy_group(struct elv_fq_data *efqd, struct io_group *iog) +{ + hlist_del(&iog->elv_data_node); + put_io_group_queues(efqd->eq, iog); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. 
+ */ + elv_put_iog(iog); +} + +static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup); + struct io_group *iog; + struct elv_fq_data *efqd; + unsigned long uninitialized_var(flags); + struct io_policy_node *pn, *pntmp; + + /* + * io groups are linked in two lists. One list is maintained + * in elevator (efqd->group_list) and other is maintained + * per cgroup structure (iocg->group_data). + * + * While a cgroup is being deleted, elevator also might be + * exiting and both might try to cleanup the same io group + * so need to be little careful. + * + * (iocg->group_data) is protected by iocg->lock. To avoid deadlock, + * we can't hold the queue lock while holding iocg->lock. So we first + * remove iog from iocg->group_data under iocg->lock. Whoever removes + * iog from iocg->group_data should call __io_destroy_group to remove + * iog. + */ + + rcu_read_lock(); + +remove_entry: + spin_lock_irqsave(&iocg->lock, flags); + + if (hlist_empty(&iocg->group_data)) { + spin_unlock_irqrestore(&iocg->lock, flags); + goto done; + } + iog = hlist_entry(iocg->group_data.first, struct io_group, + group_node); + efqd = rcu_dereference(iog->key); + hlist_del_rcu(&iog->group_node); + iog->iocg_id = 0; + spin_unlock_irqrestore(&iocg->lock, flags); + + spin_lock_irqsave(efqd->queue->queue_lock, flags); + __io_destroy_group(efqd, iog); + spin_unlock_irqrestore(efqd->queue->queue_lock, flags); + goto remove_entry; + +done: + list_for_each_entry_safe(pn, pntmp, &iocg->policy_list, node) { + policy_delete_node(pn); + kfree(pn); + } + + free_css_id(&io_subsys, &iocg->css); + rcu_read_unlock(); + BUG_ON(!hlist_empty(&iocg->group_data)); + kfree(iocg); +} + +/* + * This functions checks if iog is still in iocg->group_data, and removes it. + * If iog is not in that list, then cgroup destroy path has removed it, and + * we do not need to remove it. 
+ */ +static void +io_group_check_and_destroy(struct elv_fq_data *efqd, struct io_group *iog) +{ + struct io_cgroup *iocg; + unsigned long flags; + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + css = css_lookup(&io_subsys, iog->iocg_id); + + if (!css) + goto out; + + iocg = container_of(css, struct io_cgroup, css); + + spin_lock_irqsave(&iocg->lock, flags); + + /* + * iocg_destroy() clears iocg_id under iocg->lock once it has unlinked + * the group, so a non-zero id here means the group is still ours to + * remove. + */ + if (iog->iocg_id) { + hlist_del_rcu(&iog->group_node); + __io_destroy_group(efqd, iog); + } + + spin_unlock_irqrestore(&iocg->lock, flags); +out: + rcu_read_unlock(); +} + +/* Try to destroy every io group still on the elevator's group list. */ +static void release_elv_io_groups(struct elevator_queue *e) +{ + struct hlist_node *pos, *n; + struct io_group *iog; + struct elv_fq_data *efqd = e->efqd; + + /* _safe variant: destroying a group unlinks its elv_data_node */ + hlist_for_each_entry_safe(iog, pos, n, &efqd->group_list, + elv_data_node) { + io_group_check_and_destroy(efqd, iog); + } +} + +/* + * if bio submitting task and rq don't belong to same io_group, it can't + * be merged + */ +int elv_io_group_allow_merge(struct request *rq, struct bio *bio) +{ + struct request_queue *q = rq->q; + struct io_queue *ioq = rq->ioq; + struct io_group *iog, *__iog; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return 1; + + /* Determine the io group of the bio submitting task */ + iog = elv_io_get_io_group_bio(q, bio, 0); + if (!iog) { + /* May be task belongs to a different cgroup for which io + * group has not been setup yet. */ + return 0; + } + + /* Determine the io group of the ioq, rq belongs to */ + __iog = ioq_to_io_group(ioq); + + return (iog == __iog); +} + +/* Sets the single ioq associated with the io group. (noop, deadline, AS) */ +static inline void +elv_io_group_set_ioq(struct io_group *iog, struct io_queue *ioq) +{ + /* io group reference. Will be dropped when group is destroyed. */ + elv_get_ioq(ioq); + iog->ioq = ioq; +} + +/* + * Find/Create the io queue the rq should go in. This is an optimization + * for the io schedulers (noop, deadline and AS) which maintain only single + * io queue per cgroup.
In this case common layer can just maintain a + * pointer in group data structure and keeps track of it. + * + * For the io schdulers like cfq, which maintain multiple io queues per + * cgroup, and decide the io queue of request based on process, this + * function is not invoked. + */ +int elv_set_request_ioq(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) +{ + struct elevator_queue *e = q->elevator; + unsigned long flags; + struct io_queue *ioq = NULL, *new_ioq = NULL; + struct io_group *iog; + void *sched_q = NULL, *new_sched_q = NULL; + + if (!elv_iosched_fair_queuing_enabled(e)) + return 0; + + might_sleep_if(gfp_mask & __GFP_WAIT); + spin_lock_irqsave(q->queue_lock, flags); + +retry: + /* Determine the io group request belongs to */ + iog = elv_io_get_io_group_bio(q, bio, 1); + BUG_ON(!iog); + + /* Get the iosched queue */ + ioq = iog->ioq; + if (!ioq) { + /* io queue and sched_queue needs to be allocated */ + BUG_ON(!e->ops->elevator_alloc_sched_queue_fn); + + if (new_ioq) { + goto alloc_sched_q; + } else if (gfp_mask & __GFP_WAIT) { + /* + * Inform the allocator of the fact that we will + * just repeat this allocation if it fails, to allow + * the allocator to do whatever it needs to attempt to + * free memory. + */ + spin_unlock_irq(q->queue_lock); + new_ioq = elv_alloc_ioq(q, gfp_mask | __GFP_NOFAIL + | __GFP_ZERO); + spin_lock_irq(q->queue_lock); + goto retry; + } else { + ioq = elv_alloc_ioq(q, gfp_mask | __GFP_ZERO); + if (!ioq) + goto queue_fail; + } + +alloc_sched_q: + if (new_sched_q) { + ioq = new_ioq; + new_ioq = NULL; + sched_q = new_sched_q; + new_sched_q = NULL; + } else if (gfp_mask & __GFP_WAIT) { + /* + * Inform the allocator of the fact that we will + * just repeat this allocation if it fails, to allow + * the allocator to do whatever it needs to attempt to + * free memory. 
+ */ + spin_unlock_irq(q->queue_lock); + /* Call io scheduer to create scheduler queue */ + new_sched_q = e->ops->elevator_alloc_sched_queue_fn(q, + e, gfp_mask | __GFP_NOFAIL + | __GFP_ZERO, new_ioq); + spin_lock_irq(q->queue_lock); + goto retry; + } else { + sched_q = e->ops->elevator_alloc_sched_queue_fn(q, e, + gfp_mask | __GFP_ZERO, ioq); + if (!sched_q) { + elv_free_ioq(ioq); + goto queue_fail; + } + } + + elv_init_ioq(e, ioq, current->pid, 1); + elv_init_ioq_io_group(ioq, iog); + elv_init_ioq_sched_queue(e, ioq, sched_q); + + elv_io_group_set_ioq(iog, ioq); + elv_mark_ioq_sync(ioq); + elv_get_iog(iog); + } + + if (new_sched_q) + e->ops->elevator_free_sched_queue_fn(q->elevator, new_sched_q); + + if (new_ioq) + elv_free_ioq(new_ioq); + + /* Request reference */ + elv_get_ioq(ioq); + rq->ioq = ioq; + spin_unlock_irqrestore(q->queue_lock, flags); + return 0; + +queue_fail: + WARN_ON((gfp_mask & __GFP_WAIT) && !ioq); + elv_schedule_dispatch(q); + spin_unlock_irqrestore(q->queue_lock, flags); + return 1; +} + +/* + * Find out the io queue of bio belongs to. Optimization for single ioq + * per io group io schedulers. + */ +struct io_queue *elv_lookup_ioq_bio(struct request_queue *q, struct bio *bio) +{ + struct io_group *iog; + + /* Determine the io group and io queue of the bio submitting task */ + iog = elv_io_get_io_group_bio(q, bio, 0); + if (!iog) { + /* + * May be bio belongs to a cgroup for which io group has + * not been setup yet. + */ + return NULL; + } + return iog->ioq; +} + +/* + * This request has been serviced. Clean up ioq info and drop the reference. + * Again this is called only for single queue per cgroup schedulers (noop, + * deadline, AS). 
+ */
+void elv_reset_request_ioq(struct request_queue *q, struct request *rq)
+{
+	struct io_queue *rq_ioq = rq->ioq;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	/* Nothing attached to this request, nothing to release */
+	if (!rq_ioq)
+		return;
+
+	rq->ioq = NULL;
+	elv_put_ioq(rq_ioq);
+}
+
+/* Returns 1 when the root io cgroup has no child cgroups, 0 otherwise. */
+static inline int is_only_root_group(void)
+{
+	return list_empty(&io_root_cgroup.css.cgroup->children) ? 1 : 0;
+}
+
+#else /* CONFIG_GROUP_IOSCHED */
+
+/* Flat mode stubs: groups carry no weight and there is nothing to release */
+static inline unsigned int iog_weight(struct io_group *iog) { return 0; }
+static inline void release_elv_io_groups(struct elevator_queue *e) {}
+
+/*
+ * Allocate the single (root) io group used in flat mode on the queue's node.
+ * The group is zero-filled, has no parent, schedules its children on its own
+ * sched_data and remembers @key. Returns NULL if the allocation fails.
+ */
+static struct io_group *io_alloc_root_group(struct request_queue *q,
+				struct elevator_queue *e, void *key)
+{
+	struct io_group *root;
+	int class;
+
+	root = kmalloc_node(sizeof(*root), GFP_KERNEL | __GFP_ZERO, q->node);
+	if (!root)
+		return NULL;
+
+	root->key = key;
+	root->entity.parent = NULL;
+	root->entity.my_sd = &root->sched_data;
+
+	/* One service tree per io priority class */
+	for (class = 0; class < IO_IOPRIO_CLASSES; class++)
+		root->sched_data.service_tree[class] = ELV_SERVICE_TREE_INIT;
+
+	return root;
+}
+
+/* Release the queues held by the root group and then the group itself. */
+static void io_free_root_group(struct elevator_queue *e)
+{
+	struct io_group *root = e->efqd->root_group;
+
+	put_io_group_queues(e, root);
+	kfree(root);
+}
+
+/* No group idling in flat mode */
+int elv_iog_should_idle(struct io_queue *ioq) { return 0; }
+EXPORT_SYMBOL(elv_iog_should_idle);
+
+/* Without group scheduling there is only ever the root group. */
+static inline int is_only_root_group(void)
+{
+	return 1;
+}
+
+#endif /* CONFIG_GROUP_IOSCHED */
+
+/*
+ * Make @iog's entity the parent of @ioq's entity.
+ *
+ * Should be called after ioq prio and class has been initialized as prio
+ * class data will be used to determine which service tree in the group
+ * entity should be attached to.
+ */
+void elv_init_ioq_io_group(struct io_queue *ioq, struct io_group *iog)
+{
+	init_io_entity_parent(&ioq->entity, &iog->entity);
+}
+EXPORT_SYMBOL(elv_init_ioq_io_group);
+
+/* Get next queue for service.
*/
+/*
+ * Walks down from the root group's sched_data, picking the next entity at
+ * each level, until an entity without its own sched_data is reached; that
+ * entity's io queue is returned. Returns NULL when there are no busy queues
+ * or when some level has no entity to offer. Must not be called while a
+ * queue is still active.
+ */
+static struct io_queue *elv_get_next_ioq(struct request_queue *q)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+	struct io_entity *entity = NULL;
+	struct io_queue *ioq = NULL;
+	struct io_sched_data *sd;
+
+	BUG_ON(efqd->active_queue != NULL);
+
+	if (!efqd->busy_queues)
+		return NULL;
+
+	sd = &efqd->root_group->sched_data;
+	for (; sd != NULL; sd = entity->my_sd) {
+		entity = lookup_next_io_entity(sd);
+		if (!entity)
+			return NULL;
+	}
+
+	ioq = ioq_of(entity);
+	return ioq;
+}
+
+/*
+ * Install @ioq (may be NULL) as the active queue. For a non-NULL queue the
+ * slice bookkeeping and wait/dispatch flags are reset and the idle timer is
+ * cancelled before the io scheduler is notified.
+ *
+ * coop (cooperating queue) tells that io scheduler selected a queue for us
+ * and we did not select the next queue based on fairness.
+ */
+static void
+__elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq, int coop)
+{
+	struct request_queue *q = efqd->queue;
+	struct elevator_queue *eq = q->elevator;
+
+	if (ioq) {
+		struct io_group *iog = ioq_to_io_group(ioq);
+
+		/*
+		 * busy_queues is unsigned int and nr_queued is unsigned long,
+		 * so they are printed with %u and %lu respectively.
+		 */
+		elv_log_ioq(efqd, ioq, "set_active, busy=%u class=%hu prio=%hu"
+				" weight=%u group_weight=%u qued=%lu",
+				efqd->busy_queues, ioq->entity.ioprio_class,
+				ioq->entity.ioprio, ioq->entity.weight,
+				iog_weight(iog), ioq->nr_queued);
+		print_ioq_service_stats(ioq);
+
+		ioq->slice_start = ioq->slice_end = 0;
+		ioq->dispatch_start = jiffies;
+
+		elv_clear_ioq_wait_request(ioq);
+		elv_clear_iog_wait_request(iog);
+		elv_clear_ioq_must_dispatch(ioq);
+		elv_clear_iog_wait_busy_done(iog);
+		elv_mark_ioq_slice_new(ioq);
+		elv_clear_ioq_must_expire(ioq);
+
+		del_timer(&efqd->idle_slice_timer);
+	}
+
+	efqd->active_queue = ioq;
+
+	/* Let iosched know if it wants to take some action */
+	if (ioq && eq->ops->elevator_active_ioq_set_fn)
+		eq->ops->elevator_active_ioq_set_fn(q, ioq->sched_queue, coop);
+}
+
+/*
+ * Get and set a new active queue for service. A non-NULL @ioq is a queue
+ * chosen by the io scheduler; it is requeued first and the choice is
+ * reported back through the coop argument. Returns the queue that was made
+ * active, or NULL if none was available.
+ */
+static struct
+io_queue *elv_set_active_ioq(struct request_queue *q, struct io_queue *ioq)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+	int coop = 0;
+
+	if (ioq) {
+		requeue_ioq(ioq, 1);
+		/*
+		 * io scheduler selected the next queue for us. Pass
+		 * this info back to io scheduler. cfq currently uses it
+		 * to reset coop flag on the queue.
+		 */
+		coop = 1;
+	}
+
+	ioq = elv_get_next_ioq(q);
+	__elv_set_active_ioq(efqd, ioq, coop);
+	return ioq;
+}
+
+/*
+ * Clear the active queue and cancel the idle timer, telling the io scheduler
+ * first if it registered a reset hook. The hook is handed the active queue's
+ * sched_queue, so there must be an active queue when this is called.
+ */
+static void elv_reset_active_ioq(struct elv_fq_data *efqd)
+{
+	struct request_queue *q = efqd->queue;
+	struct elevator_queue *eq = q->elevator;
+	struct io_queue *ioq = elv_active_ioq(eq);
+
+	if (eq->ops->elevator_active_ioq_reset_fn)
+		eq->ops->elevator_active_ioq_reset_fn(q, ioq->sched_queue);
+
+	efqd->active_queue = NULL;
+	del_timer(&efqd->idle_slice_timer);
+}
+
+/* Called when an inactive queue receives a new request. */
+static void elv_add_ioq_busy(struct elv_fq_data *efqd, struct io_queue *ioq)
+{
+	BUG_ON(elv_ioq_busy(ioq));
+	BUG_ON(ioq == efqd->active_queue);
+	enqueue_ioq(ioq);
+	elv_mark_ioq_busy(ioq);
+	efqd->busy_queues++;
+	/* nr_queued is unsigned long, hence %lu */
+	elv_log_ioq(efqd, ioq, "add to busy: qued=%lu", ioq->nr_queued);
+	print_ioq_service_stats(ioq);
+}
+
+/*
+ * Take a queue with no queued requests off the busy list: clears the busy
+ * flag, decrements the busy queue count and dequeues the queue.
+ */
+static void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq)
+{
+	struct elv_fq_data *efqd = e->efqd;
+
+	BUG_ON(!elv_ioq_busy(ioq));
+	BUG_ON(ioq->nr_queued);
+	/* nr_queued is unsigned long, hence %lu */
+	elv_log_ioq(efqd, ioq, "del from busy: qued=%lu", ioq->nr_queued);
+	print_ioq_service_stats(ioq);
+	elv_clear_ioq_busy(ioq);
+	BUG_ON(efqd->busy_queues == 0);
+	efqd->busy_queues--;
+	dequeue_ioq(ioq);
+}
+
+/*
+ * Call iosched to let that elevator wants to expire the queue. This gives
+ * iosched like AS to say no (if it is in the middle of batch changeover or
+ * it is anticipating). it also allows iosched to do some house keeping
+ *
+ * if force = 1, it is force dispatch and iosched must clean up its state.
+ * This is useful when elevator wants to drain iosched and wants to expire
+ * current active queue.
+ * if slice_expired = 1, ioq slice expired hence elevator fair queuing logic
+ * wants to switch the queue. iosched should allow that until and unless
+ * necessary.
Currently AS can deny the switch if in the middle of batch switch.
+ *
+ * if slice_expired = 0, time slice is still remaining. It is up to the iosched
+ * whether it wants to wait on this queue or just want to expire it and move
+ * on to next queue.
+ *
+ * Returns 1 when the queue may be expired (always the case when the iosched
+ * has no expire hook), otherwise the hook's return value.
+ */
+static int
+elv_iosched_expire_ioq(struct request_queue *q, int slice_expired, int force)
+{
+	struct elevator_queue *eq = q->elevator;
+	struct io_queue *active = elv_active_ioq(q->elevator);
+	int allowed;
+
+	/* No hook registered: the iosched has no say, expiry is allowed */
+	if (!eq->ops->elevator_expire_ioq_fn)
+		return 1;
+
+	allowed = eq->ops->elevator_expire_ioq_fn(q, active->sched_queue,
+						slice_expired, force);
+	if (allowed)
+		return allowed;
+
+	/*
+	 * AS denied expiration of queue right now. Mark that elevator
+	 * layer has requested ioscheduler (as) to expire this queue.
+	 * Now as will try to expire this queue as soon as it can.
+	 * Now don't try to dispatch from this queue even if we get
+	 * a new request and if time slice is left. Do expire it once.
+	 */
+	elv_mark_ioq_must_expire(active);
+	return allowed;
+}
+
+/*
+ * Do the accounting. Determine how much service (in terms of time slices)
+ * current queue used and adjust the start, finish time of queue and vtime
+ * of the tree accordingly.
+ *
+ * Determining the service used in terms of time is tricky in certain
+ * situations. Especially when underlying device supports command queuing
+ * and requests from multiple queues can be there at same time, then it
+ * is not clear which queue consumed how much of disk time.
+ *
+ * To mitigate this problem, cfq starts the time slice of the queue only
+ * after first request from the queue has completed. This does not work
+ * very well if we expire the queue before we wait for first and more
+ * request to finish from the queue. For seeky queues, we will expire the
+ * queue after dispatching few requests without waiting and start dispatching
+ * from next queue.
+ *
+ * Currently one should set fairness = 1 to force completion of requests
+ * from queue before dispatch from next queue starts.
This should help in
+ * better time accounting at the expense of throughput.
+ */
+void elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+	long used = 0, overshoot = 0;
+	struct io_group *iog = ioq_to_io_group(ioq);
+
+	assert_spin_locked(q->queue_lock);
+	elv_log_ioq(efqd, ioq, "slice expired");
+
+	/* Any pending queue/group wait means the idle timer may be armed */
+	if (elv_ioq_wait_request(ioq) || elv_iog_wait_request(iog)
+	    || elv_iog_wait_busy(iog))
+		del_timer(&efqd->idle_slice_timer);
+
+	elv_clear_ioq_wait_request(ioq);
+	elv_clear_iog_wait_request(iog);
+	elv_clear_iog_wait_busy(iog);
+	elv_clear_iog_wait_busy_done(iog);
+	elv_clear_ioq_must_expire(ioq);
+
+	if (!ioq->slice_end || ioq->slice_start == jiffies) {
+		/*
+		 * Queue got expired before even a single request completed
+		 * or got expired immediately after first request completion.
+		 * Use the time elapsed since queue was scheduled in, and
+		 * charge at least one jiffy.
+		 */
+		used = jiffies - ioq->dispatch_start;
+		if (!used)
+			used = 1;
+	} else {
+		/* Normal case: charge from slice start, note any overrun */
+		used = jiffies - ioq->slice_start;
+		if (time_after(jiffies, ioq->slice_end))
+			overshoot = jiffies - ioq->slice_end;
+	}
+
+	elv_log_ioq(efqd, ioq, "disp_start = %lu sl_start= %lu sl_end=%lu,"
+				" jiffies=%lu", ioq->dispatch_start, ioq->slice_start,
+				ioq->slice_end, jiffies);
+	elv_log_ioq(efqd, ioq, "sl_used=%ld, overshoot=%ld sect=%lu",
+				used, overshoot, ioq->nr_sectors);
+	elv_ioq_served(ioq, used);
+
+	BUG_ON(ioq != efqd->active_queue);
+	elv_reset_active_ioq(efqd);
+	/* Queue is being expired. Reset number of sectors dispatched */
+	ioq->nr_sectors = 0;
+
+	put_prev_ioq(ioq);
+
+	if (!ioq->nr_queued) {
+		/* Nothing left queued: the queue is no longer busy */
+		elv_del_ioq_busy(q->elevator, ioq);
+	} else if (!elv_ioq_sync(ioq)) {
+		/*
+		 * Requeue async ioq so that these will be again placed at
+		 * the end of service tree giving a chance to sync queues.
+		 */
+		requeue_ioq(ioq, 0);
+	}
+}
+EXPORT_SYMBOL(elv_ioq_slice_expired);
+
+/* Expire the ioq.
*/
+void elv_slice_expired(struct request_queue *q)
+{
+	struct io_queue *ioq = elv_active_ioq(q->elevator);
+
+	if (ioq)
+		elv_ioq_slice_expired(q, ioq);
+}
+
+/*
+ * Check if new_ioq should preempt the currently active queue. Return 0 for
+ * no or if we aren't sure, a 1 will cause a preemption attempt.
+ */
+static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
+			struct request *rq)
+{
+	struct io_queue *ioq;
+	struct elevator_queue *eq = q->elevator;
+	struct io_entity *entity, *new_entity;
+	struct io_group *iog = NULL, *new_iog = NULL;
+
+	/*
+	 * Currently only CFQ has preemption logic. Other schedulers don't
+	 * have any notion of preemption across classes or preemption with-in
+	 * class etc.
+	 */
+	if (elv_iosched_single_ioq(eq))
+		return 0;
+
+	ioq = elv_active_ioq(eq);
+
+	if (!ioq)
+		return 0;
+
+	entity = &ioq->entity;
+	new_entity = &new_ioq->entity;
+
+	/*
+	 * In hierarchical setup, one need to traverse up the hierarchy
+	 * till both the queues are children of same parent to make a
+	 * decision whether to do the preemption or not.
+	 */
+	find_matching_io_entity(&entity, &new_entity);
+
+	/*
+	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+	 */
+
+	if (new_entity->ioprio_class == IOPRIO_CLASS_RT
+	    && entity->ioprio_class != IOPRIO_CLASS_RT)
+		return 1;
+	/*
+	 * Allow a BE request to pre-empt an ongoing IDLE class timeslice.
+	 */
+
+	if (new_entity->ioprio_class == IOPRIO_CLASS_BE
+	    && entity->ioprio_class == IOPRIO_CLASS_IDLE)
+		return 1;
+
+	/*
+	 * If both the queues belong to same group, check with io scheduler
+	 * if it has additional criterion based on which it wants to
+	 * preempt existing queue.
+	 */
+	iog = ioq_to_io_group(ioq);
+	new_iog = ioq_to_io_group(new_ioq);
+
+	if (iog != new_iog)
+		return 0;
+
+	if (eq->ops->elevator_should_preempt_fn) {
+		void *sched_queue = elv_ioq_sched_queue(new_ioq);
+
+		return eq->ops->elevator_should_preempt_fn(q, sched_queue, rq);
+	}
+
+	return 0;
+}
+
+/*
+ * Try to preempt the active queue in favour of @ioq. Nothing happens if the
+ * io scheduler refuses to expire the active queue.
+ */
+static void elv_preempt_queue(struct request_queue *q, struct io_queue *ioq)
+{
+	if (elv_iosched_expire_ioq(q, 0, 1)) {
+		elv_log_ioq(q->elevator->efqd, ioq, "preempt");
+		elv_slice_expired(q);
+
+		/*
+		 * Put the new queue at the front of the current list,
+		 * so we know that it will be selected next.
+		 */
+
+		requeue_ioq(ioq, 1);
+		elv_mark_ioq_slice_new(ioq);
+	}
+}
+
+/*
+ * Account a request newly added to rq->ioq: bump the queued count, mark the
+ * queue busy if it was not, end any group wait, and decide whether to kick
+ * the queue, preempt the active queue or just schedule a dispatch.
+ * Does nothing when fair queuing is not enabled for the elevator.
+ */
+void elv_ioq_request_add(struct request_queue *q, struct request *rq)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+	struct io_queue *ioq = rq->ioq;
+	struct io_group *iog;
+	int group_wait = 0;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	BUG_ON(!efqd);
+	BUG_ON(!ioq);
+	/*
+	 * Resolve the group only after fair queuing is known to be enabled
+	 * and ioq has been asserted, so the assertion precedes the first use.
+	 */
+	iog = ioq_to_io_group(ioq);
+	ioq->nr_queued++;
+	/* nr_queued is unsigned long, hence %lu */
+	elv_log_ioq(efqd, ioq, "add rq: rq_queued=%lu", ioq->nr_queued);
+
+	if (!elv_ioq_busy(ioq))
+		elv_add_ioq_busy(efqd, ioq);
+
+	if (elv_iog_wait_request(iog)) {
+		del_timer(&efqd->idle_slice_timer);
+		elv_clear_iog_wait_request(iog);
+		group_wait = 1;
+	}
+
+	/*
+	 * If we were waiting for a request on this group, wait is
+	 * done. Schedule the next dispatch
+	 */
+	if (elv_iog_wait_busy(iog)) {
+		del_timer(&efqd->idle_slice_timer);
+		elv_clear_iog_wait_busy(iog);
+		elv_mark_iog_wait_busy_done(iog);
+		elv_schedule_dispatch(q);
+		return;
+	}
+
+	if (ioq == elv_active_ioq(q->elevator)) {
+		/*
+		 * Remember that we saw a request from this process, but
+		 * don't start queuing just yet. Otherwise we risk seeing lots
+		 * of tiny requests, because we disrupt the normal plugging
+		 * and merging. If the request is already larger than a single
+		 * page, let it rip immediately. For that case we assume that
+		 * merging is already done. Ditto for a busy system that
+		 * has other work pending, don't risk delaying until the
+		 * idle timer unplug to continue working.
+		 */
+		if (group_wait || elv_ioq_wait_request(ioq)) {
+			del_timer(&efqd->idle_slice_timer);
+			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
+			    efqd->busy_queues > 1 || !blk_queue_plugged(q))
+				__blk_run_queue(q);
+			else
+				elv_mark_ioq_must_dispatch(ioq);
+		}
+	} else if (elv_should_preempt(q, ioq, rq)) {
+		/*
+		 * not the active queue - expire current slice if it is
+		 * idle and has expired it's mean thinktime or this new queue
+		 * has some old slice time left and is of higher priority or
+		 * this new queue is RT and the current one is BE
+		 */
+		elv_preempt_queue(q, ioq);
+		__blk_run_queue(q);
+	} else if (group_wait) {
+		/*
+		 * Got a request in the group we were waiting for. Request
+		 * does not belong to active queue and we have not decided
+		 * to preempt the current active queue. Schedule the dispatch.
+		 */
+		elv_schedule_dispatch(q);
+	}
+}
+
+/*
+ * Idle slice timer handler; @data is the elv_fq_data the timer belongs to.
+ * Under the queue lock it either leaves the active queue alone, kicks a
+ * dispatch, or expires the active queue and then kicks a dispatch.
+ */
+static void elv_idle_slice_timer(unsigned long data)
+{
+	struct elv_fq_data *efqd = (struct elv_fq_data *)data;
+	struct io_queue *ioq;
+	unsigned long flags;
+	struct request_queue *q = efqd->queue;
+
+	elv_log(efqd, "idle timer fired");
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	ioq = efqd->active_queue;
+
+	if (ioq) {
+		struct io_group *iog = ioq_to_io_group(ioq);
+
+		elv_clear_iog_wait_request(iog);
+
+		if (elv_iog_wait_busy(iog)) {
+			elv_clear_iog_wait_busy(iog);
+			goto expire;
+		}
+
+		/*
+		 * We saw a request before the queue expired, let it through
+		 */
+		if (elv_ioq_must_dispatch(ioq))
+			goto out_kick;
+
+		/*
+		 * expired
+		 */
+		if (elv_ioq_slice_used(ioq))
+			goto expire;
+
+		/*
+		 * only expire and reinvoke request handler, if there are
+		 * other queues with pending requests
+		 */
+		if (!elv_nr_busy_ioq(q->elevator))
+			goto out_cont;
+
+		/*
+		 * not expired and it has a request pending, let it dispatch
+		 */
+		if (ioq->nr_queued)
+			goto out_kick;
+	}
+expire:
+	elv_slice_expired(q);
+out_kick:
+	elv_schedule_dispatch(q);
+out_cont:
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+/*
+ * Ask the io scheduler, if it provides the hook, to arm the idle timer for
+ * the active queue. The hook receives the active queue's sched_queue, so an
+ * active queue must exist when this is called.
+ */
+static void elv_ioq_arm_slice_timer(struct request_queue *q)
+{
+	struct elevator_queue *eq = q->elevator;
+	struct io_queue *ioq = elv_active_ioq(eq);
+
+	if (eq->ops->elevator_arm_slice_timer_fn)
+		eq->ops->elevator_arm_slice_timer_fn(q, ioq->sched_queue);
+}
+
+/*
+ * Arm the idle timer for elv_group_idle jiffies on behalf of @iog, marking
+ * the group as wait_busy (@wait_for_busy != 0) or wait_request. Does nothing
+ * if group idling is turned off or the group has no idle window.
+ */
+static void elv_iog_arm_slice_timer(struct request_queue *q,
+				struct io_group *iog, int wait_for_busy)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+	unsigned long sl;
+
+	if (!efqd->elv_group_idle || !elv_iog_idle_window(iog))
+		return;
+	/*
+	 * This queue has consumed its time slice. We are waiting only for
+	 * it to become busy before we select next queue for dispatch.
+	 */
+	if (wait_for_busy) {
+		elv_mark_iog_wait_busy(iog);
+		sl = efqd->elv_group_idle;
+		mod_timer(&efqd->idle_slice_timer, jiffies + sl);
+		elv_log_iog(efqd, iog, "arm idle group: %lu wait busy=1", sl);
+		return;
+	}
+
+	elv_mark_iog_wait_request(iog);
+	sl = efqd->elv_group_idle;
+	mod_timer(&efqd->idle_slice_timer, jiffies + sl);
+	elv_log_iog(efqd, iog, "arm_idle group: %lu", sl);
+}
+
+/*
+ * If io scheduler has functionality of keeping track of close cooperator, check
+ * with it if it has got a closely co-operating queue.
+ *
+ * Returns the cooperating queue reported by the io scheduler, or NULL when
+ * there is no hook, no such queue, or the queue is in a different group.
+ */
+static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
+					struct io_queue *ioq)
+{
+	struct elevator_queue *e = q->elevator;
+	struct io_queue *new_ioq = NULL;
+	void *sched_queue = ioq->sched_queue;
+
+	if (q->elevator->ops->elevator_close_cooperator_fn)
+		new_ioq = e->ops->elevator_close_cooperator_fn(q, sched_queue);
+
+	if (new_ioq)
+		elv_log_ioq(e->efqd, ioq, "cooperating ioq=%d", new_ioq->pid);
+
+	/* Only select co-operating queue if it belongs to same group as ioq */
+	if (new_ioq && !is_same_group(&ioq->entity, &new_ioq->entity))
+		return NULL;
+
+	return new_ioq;
+}
+
+/*
+ * Common layer function to select the next queue to dispatch from.
+ *
+ * Returns the queue to dispatch from, or NULL when nothing should be
+ * dispatched right now (no busy queues, or we are idling/waiting on the
+ * current queue or group). @force requests forced dispatch.
+ */
+void *elv_select_ioq(struct request_queue *q, int force)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+	struct io_queue *new_ioq = NULL, *ioq = elv_active_ioq(q->elevator);
+	struct io_group *iog;
+	struct elevator_type *e = q->elevator->elevator_type;
+	int slice_expired = 1;
+
+	if (!elv_nr_busy_ioq(q->elevator))
+		return NULL;
+
+	if (ioq == NULL)
+		goto new_queue;
+
+	iog = ioq_to_io_group(ioq);
+
+	/*
+	 * Force dispatch. Continue to dispatch from current queue as long
+	 * as it has requests.
+	 */
+	if (unlikely(force)) {
+		if (ioq->nr_queued)
+			goto keep_queue;
+		else
+			goto expire;
+	}
+
+	/* This queue has been marked for expiry. Try to expire it */
+	if (elv_ioq_must_expire(ioq))
+		goto expire;
+
+	/*
+	 * If there is only root group present, don't expire the queue for
+	 * single queue ioschedulers (noop, deadline, AS).
+	 */
+
+	if (is_only_root_group() && elv_iosched_single_ioq(q->elevator))
+		goto keep_queue;
+
+	/* We are waiting for this group to become busy before it expires.*/
+	if (elv_iog_wait_busy(iog)) {
+		ioq = NULL;
+		goto keep_queue;
+	}
+
+	/*
+	 * The active queue has run out of time, expire it and select new.
+	 */
+	if ((elv_ioq_slice_used(ioq) || elv_ioq_class_idle(ioq))
+	     && !elv_ioq_must_dispatch(ioq)) {
+		/*
+		 * Queue has used up its slice. Wait busy is not on otherwise
+		 * we wouldn't have been here. If this group will be deleted
+		 * after the queue expiry, then make sure we have once
+		 * done wait busy on the group in an attempt to make it
+		 * backlogged.
+		 *
+		 * Following check helps in two conditions.
+		 * - If there are requests dispatched from the queue and
+		 *   select_ioq() comes before a request completed from the
+		 *   queue and got a chance to arm any of the idle timers.
+		 *
+		 * - If at request completion time slice had not expired and
+		 *   we armed either a ioq timer or group timer but when
+		 *   select_ioq() hits, slice has expired and it will expire
+		 *   the queue without doing busy wait on group.
+		 *
+		 * In similar situations cfq lets delete the queue even if
+		 * idle timer is armed. That does not impact fairness in non
+		 * hierarchical setup due to weighted slice lengths. But in
+		 * hierarchical setup where group slice lengths are derived
+		 * from queue and is not proportional to group's weight, it
+		 * harms the fairness of the group.
+		 */
+		if (elv_iog_should_idle(ioq) && !elv_iog_wait_busy_done(iog)) {
+			ioq = NULL;
+			goto keep_queue;
+		} else
+			goto expire;
+	}
+
+	/*
+	 * The active queue has requests and isn't expired, allow it to
+	 * dispatch.
+	 */
+
+	if (ioq->nr_queued)
+		goto keep_queue;
+
+	/*
+	 * If another queue has a request waiting within our mean seek
+	 * distance, let it run. The expire code will check for close
+	 * cooperators and put the close queue at the front of the service
+	 * tree.
+	 */
+	new_ioq = elv_close_cooperator(q, ioq);
+	if (new_ioq)
+		goto expire;
+
+	/*
+	 * No requests pending. If the active queue still has requests in
+	 * flight or is idling for a new request, allow either of these
+	 * conditions to happen (or time out) before selecting a new queue.
+	 */
+
+	if (timer_pending(&efqd->idle_slice_timer) ||
+	    (elv_ioq_nr_dispatched(ioq) && elv_ioq_idle_window(ioq))) {
+		ioq = NULL;
+		goto keep_queue;
+	}
+
+	/* Check for group idling */
+	if (elv_iog_should_idle(ioq) && elv_ioq_nr_dispatched(ioq)) {
+		ioq = NULL;
+		goto keep_queue;
+	}
+
+	slice_expired = 0;
+expire:
+	if (efqd->fairness && !force && ioq && ioq->dispatched
+	    && strcmp(e->elevator_name, "anticipatory")) {
+		/*
+		 * If there are request dispatched from this queue, don't
+		 * dispatch requests from new queue till all the requests from
+		 * this queue have completed.
+		 *
+		 * Anticipatory does not allow queue switch until requests
+		 * from previous queue have finished.
+		 *
+		 * ioq->dispatched is an int, hence %d in the trace message.
+		 */
+		elv_log_ioq(efqd, ioq, "select: wait for requests to finish"
+				" disp=%d", ioq->dispatched);
+		ioq = NULL;
+		goto keep_queue;
+	}
+
+	if (elv_iosched_expire_ioq(q, slice_expired, force))
+		elv_slice_expired(q);
+	else
+		/*
+		 * Not making ioq = NULL, as AS can deny queue expiration and
+		 * continue to dispatch from same queue
+		 */
+		goto keep_queue;
+new_queue:
+	ioq = elv_set_active_ioq(q, new_ioq);
+keep_queue:
+	/* nr_queued is unsigned long, hence %lu */
+	if (ioq)
+		elv_log_ioq(efqd, ioq, "select busy=%d qued=%lu disp=%d",
+				elv_nr_busy_ioq(q->elevator), ioq->nr_queued,
+				elv_ioq_nr_dispatched(ioq));
+	return ioq;
+}
+
+/* A request got removed from io_queue. Do the accounting */
+void elv_ioq_request_removed(struct elevator_queue *e, struct request *rq)
+{
+	struct io_queue *ioq;
+	struct elv_fq_data *efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return;
+
+	ioq = rq->ioq;
+	BUG_ON(!ioq);
+	ioq->nr_queued--;
+
+	/* Sanity check only: the queue must point back at its fq data */
+	efqd = ioq->efqd;
+	BUG_ON(!efqd);
+}
+
+/* A request got dispatched. Do the accounting.
*/
+void elv_dispatched_request_fair(struct elevator_queue *e, struct request *rq)
+{
+	struct io_queue *ioq = rq->ioq;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return;
+
+	BUG_ON(!ioq);
+	ioq->dispatched++;
+	ioq->nr_sectors += blk_rq_sectors(rq);
+	elv_ioq_request_removed(e, rq);
+	elv_clear_ioq_must_dispatch(ioq);
+}
+
+/* A request was handed to the driver: count it in rq_in_driver. */
+void elv_activate_rq_fair(struct request_queue *q, struct request *rq)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	efqd->rq_in_driver++;
+	elv_log_ioq(efqd, rq->ioq, "activate rq, drv=%d",
+						efqd->rq_in_driver);
+}
+
+/* A request was taken back from the driver: undo the rq_in_driver count. */
+void elv_deactivate_rq_fair(struct request_queue *q, struct request *rq)
+{
+	struct elv_fq_data *efqd = q->elevator->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	WARN_ON(!efqd->rq_in_driver);
+	efqd->rq_in_driver--;
+	elv_log_ioq(efqd, rq->ioq, "deactivate rq, drv=%d",
+						efqd->rq_in_driver);
+}
+
+/* A request got completed from io_queue. Do the accounting. */
+void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
+{
+	const int sync = rq_is_sync(rq);
+	struct io_queue *ioq;
+	struct elv_fq_data *efqd = q->elevator->efqd;
+	struct io_group *iog;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	ioq = rq->ioq;
+	iog = ioq_to_io_group(ioq);
+	WARN_ON(!efqd->rq_in_driver);
+	WARN_ON(!ioq->dispatched);
+	efqd->rq_in_driver--;
+	ioq->dispatched--;
+
+	/* nr_queued is unsigned long, hence %lu */
+	elv_log_ioq(efqd, ioq, "complete rq_queued=%lu drv=%d disp=%d",
+			ioq->nr_queued, efqd->rq_in_driver,
+			elv_ioq_nr_dispatched(ioq));
+	/*
+	 * If this is the active queue, check if it needs to be expired,
+	 * or if we want to idle in case it has no pending requests.
+	 */
+
+	if (elv_active_ioq(q->elevator) == ioq) {
+		if (elv_ioq_slice_new(ioq)) {
+			elv_set_prio_slice(q->elevator->efqd, ioq);
+			elv_clear_ioq_slice_new(ioq);
+		}
+		/*
+		 * If there is only root group present, don't expire the queue
+		 * for single queue ioschedulers (noop, deadline, AS). It is
+		 * unnecessary overhead.
+		 */
+
+		if (is_only_root_group() &&
+			elv_iosched_single_ioq(q->elevator)) {
+			elv_log_ioq(efqd, ioq, "select: only root group,"
+					" no expiry");
+			goto done;
+		}
+
+		/*
+		 * If there are no requests waiting in this queue, and
+		 * there are other queues ready to issue requests, AND
+		 * those other queues are issuing requests within our
+		 * mean seek distance, give them a chance to run instead
+		 * of idling.
+		 */
+		if (elv_ioq_slice_used(ioq) || elv_ioq_class_idle(ioq)) {
+			/*
+			 * This is the last empty queue in the group and it
+			 * has consumed its slice. If we expire it right away
+			 * group might lose its share. Wait for an extra
+			 * group_idle period for a request before queue
+			 * expires.
+			 */
+			if (elv_iog_should_idle(ioq)) {
+				elv_iog_arm_slice_timer(q, iog, 1);
+				goto done;
+			}
+
+			/* Wait for requests to finish from this queue */
+			if (efqd->fairness && elv_ioq_nr_dispatched(ioq))
+				goto done;
+
+			/* Expire the queue */
+			if (elv_iosched_expire_ioq(q, 1, 0)) {
+				elv_slice_expired(q);
+				goto done;
+			}
+		} else if (!ioq->nr_queued && !elv_close_cooperator(q, ioq)
+			 && sync && !rq_noidle(rq))
+			elv_ioq_arm_slice_timer(q);
+		/*
+		 * If this is the last queue in the group and we did not
+		 * decide to idle on queue, idle on group.
+		 */
+		if (elv_iog_should_idle(ioq) && !elv_ioq_nr_dispatched(ioq)
+		    && !timer_pending(&efqd->idle_slice_timer)) {
+			/*
+			 * If queue has used up its slice, wait for the
+			 * one extra group_idle period to let the group
+			 * backlogged again. This is to avoid a group losing
+			 * its fair share.
+			 */
+			if (elv_ioq_slice_used(ioq))
+				elv_iog_arm_slice_timer(q, iog, 1);
+			else
+				elv_iog_arm_slice_timer(q, iog, 0);
+		}
+	}
+done:
+	if (!efqd->rq_in_driver)
+		elv_schedule_dispatch(q);
+}
+
+static void elv_slab_kill(void)
+{
+	/*
+	 * Caller already ensured that pending RCU callbacks are completed,
+	 * so we should have no busy allocations at this point.
+	 */
+	if (elv_ioq_pool)
+		kmem_cache_destroy(elv_ioq_pool);
+}
+
+/* Create the io_queue slab cache. Returns 0 on success, -ENOMEM on failure. */
+static int __init elv_slab_setup(void)
+{
+	elv_ioq_pool = KMEM_CACHE(io_queue, 0);
+	if (!elv_ioq_pool)
+		goto fail;
+
+	return 0;
+fail:
+	elv_slab_kill();
+	return -ENOMEM;
+}
+
+/* Allocate zeroed fair queuing data on the queue's node; NULL on failure. */
+struct elv_fq_data *
+elv_alloc_fq_data(struct request_queue *q, struct elevator_queue *e)
+{
+	struct elv_fq_data *efqd = NULL;
+
+	efqd = kmalloc_node(sizeof(*efqd), GFP_KERNEL | __GFP_ZERO, q->node);
+	return efqd;
+}
+
+void elv_release_fq_data(struct elv_fq_data *efqd)
+{
+	kfree(efqd);
+}
+
+/*
+ * Initialize fair queueing data associated with elevator.
+ * Returns 0 on success (or when fair queuing is not enabled) and 1 if the
+ * root group cannot be allocated.
+ */
+int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
+{
+	struct io_group *iog;
+	struct elv_fq_data *efqd = e->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return 0;
+
+	iog = io_alloc_root_group(q, e, efqd);
+	if (iog == NULL)
+		return 1;
+
+	efqd->root_group = iog;
+
+	/*
+	 * Our fallback ioq if elv_alloc_ioq() runs into OOM issues.
+	 * Grab a permanent reference to it, so that the normal code flow
+	 * will not attempt to free it.
+	 */
+	elv_init_ioq(e, &efqd->oom_ioq, 1, 0);
+	elv_get_ioq(&efqd->oom_ioq);
+	elv_init_ioq_io_group(&efqd->oom_ioq, iog);
+
+	efqd->queue = q;
+	efqd->eq = e;
+
+	init_timer(&efqd->idle_slice_timer);
+	efqd->idle_slice_timer.function = elv_idle_slice_timer;
+	efqd->idle_slice_timer.data = (unsigned long) efqd;
+
+	INIT_WORK(&efqd->unplug_work, elv_kick_queue);
+	INIT_HLIST_HEAD(&efqd->group_list);
+
+	efqd->elv_slice[0] = elv_slice_async;
+	efqd->elv_slice[1] = elv_slice_sync;
+	efqd->elv_group_idle = elv_group_idle;
+
+	return 0;
+}
+
+/*
+ * elv_exit_fq_data is called before we call elevator_exit_fn. Before
+ * we ask elevator to cleanup its queues, we do the cleanup here so
+ * that all the group and idle tree references to ioq are dropped. Later
+ * during elevator cleanup, ioc reference will be dropped which will lead
+ * to removal of ioscheduler queue as well as associated ioq object.
+ */
+void elv_exit_fq_data(struct elevator_queue *e)
+{
+	struct elv_fq_data *efqd = e->efqd;
+	struct request_queue *q;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return;
+
+	/*
+	 * efqd->queue is only set by elv_init_fq_data() when fair queuing
+	 * is enabled, so read it only after the check above.
+	 */
+	q = efqd->queue;
+
+	elv_shutdown_timer_wq(e);
+
+	spin_lock_irq(q->queue_lock);
+	release_elv_io_groups(e);
+	spin_unlock_irq(q->queue_lock);
+
+	elv_shutdown_timer_wq(e);
+
+	/* Wait for iog->key accessors to exit their grace periods. */
+	synchronize_rcu();
+
+	BUG_ON(timer_pending(&efqd->idle_slice_timer));
+	io_free_root_group(e);
+}
+
+static int __init elv_fq_init(void)
+{
+	if (elv_slab_setup())
+		return -ENOMEM;
+
+	/* could be 0 on HZ < 1000 setups */
+
+	if (!elv_slice_async)
+		elv_slice_async = 1;
+
+	return 0;
+}
+
+module_init(elv_fq_init);
diff --git a/block/elevator-fq.h b/block/elevator-fq.h
new file mode 100644
index 0000000..d462269
--- /dev/null
+++ b/block/elevator-fq.h
@@ -0,0 +1,644 @@
+/*
+ * elevator fair queuing Layer. Data structures and common functions prototypes.
+ * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#ifdef CONFIG_BLOCK +#include +#include + +#ifndef _ELV_SCHED_H +#define _ELV_SCHED_H + +#define IO_WEIGHT_MIN 100 +#define IO_WEIGHT_MAX 1000 +#define IO_WEIGHT_DEFAULT 500 +#define IO_IOPRIO_CLASSES 3 + +#ifdef CONFIG_ELV_FAIR_QUEUING +#define ELV_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, elv_##name##_show, elv_##name##_store) + +struct io_service_tree { + struct rb_root active; + struct io_entity *active_entity; + u64 min_vdisktime; + struct rb_node *rb_leftmost; + unsigned int nr_active; +}; + +struct io_sched_data { + struct io_entity *active_entity; + int nr_active; + struct io_service_tree service_tree[IO_IOPRIO_CLASSES]; +}; + +struct io_entity { + struct rb_node rb_node; + int on_st; + u64 vdisktime; + unsigned int weight; + struct io_entity *parent; + + struct io_sched_data *my_sd; + struct io_service_tree *st; + + unsigned short ioprio, ioprio_class; + int ioprio_changed; + + /* + * Keep track of total service received by this entity. Keep the + * stats both for time slices and number of sectors dispatched + */ + unsigned long total_time; + unsigned long total_sectors; +}; + +/* + * A common structure representing the io queue where requests are actually + * queued. 
+ */ +struct io_queue { + struct io_entity entity; + atomic_t ref; + unsigned int flags; + + /* Pointer to generic elevator fair queuing data structure */ + struct elv_fq_data *efqd; + pid_t pid; + + /* Number of requests queued on this io queue */ + unsigned long nr_queued; + + /* Requests dispatched from this queue */ + int dispatched; + + /* Number of sectors dispatched in current dispatch round */ + unsigned long nr_sectors; + + /* time when dispatch from the queue was started */ + unsigned long dispatch_start; + /* time when first request from queue completed and slice started. */ + unsigned long slice_start; + unsigned long slice_end; + + /* Pointer to io scheduler's queue */ + void *sched_queue; +}; + +#ifdef CONFIG_GROUP_IOSCHED /* CONFIG_GROUP_IOSCHED */ + +enum io_group_state { + IOG_async_congested, /* The async queue of group is getting full */ + IOG_sync_congested, /* The sync queue of group is getting full */ + IOG_unused, /* Available bits start here */ +}; + +struct io_group { + struct io_entity entity; + atomic_t ref; + unsigned int flags; + struct io_sched_data sched_data; + struct hlist_node group_node; + struct hlist_node elv_data_node; + unsigned short iocg_id; + /* + * async queue for each priority case for RT and BE class. + * Used only for cfq. 
+ */ + + struct io_queue *async_queue[2][IOPRIO_BE_NR]; + struct io_queue *async_idle_queue; + void *key; + struct rcu_head rcu_head; + + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + /* How many times this group has been added to active tree */ + unsigned long queue; + + /* How many times this group has been removed from active tree */ + unsigned long dequeue; + + /* Store cgroup path */ + char path[128]; +#endif + + /* Single ioq per group, used for noop, deadline, anticipatory */ + struct io_queue *ioq; + + /* io group congestion on and off threshold for request descriptors */ + unsigned int nr_congestion_on; + unsigned int nr_congestion_off; + + unsigned long state; + /* request list associated with the group */ + struct request_list rl; +}; + +struct io_policy_node { + struct list_head node; + dev_t dev; + unsigned int weight; + unsigned short ioprio_class; +}; + +struct io_cgroup { + struct cgroup_subsys_state css; + + unsigned int weight; + unsigned short ioprio_class; + + /* list of io_policy_node */ + struct list_head policy_list; + + spinlock_t lock; + struct hlist_head group_data; +}; + + +#else /* CONFIG_GROUP_IOSCHED */ + +struct io_group { + struct io_entity entity; + struct io_sched_data sched_data; + /* + * async queue for each priority case for RT and BE class. + * Used only for cfq. 
+ */ + + struct io_queue *async_queue[2][IOPRIO_BE_NR]; + struct io_queue *async_idle_queue; + void *key; +}; + +#endif /* CONFIG_GROUP_IOSCHED */ + +struct elv_fq_data { + struct io_group *root_group; + + /* List of io groups hanging on this elevator */ + struct hlist_head group_list; + + struct request_queue *queue; + struct elevator_queue *eq; + unsigned int busy_queues; + + /* Pointer to the ioscheduler queue being served */ + void *active_queue; + + int rq_in_driver; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + unsigned int elv_group_idle; + + /* Base slice length for sync and async queues */ + unsigned int elv_slice[2]; + + /* Fallback dummy ioq for extreme OOM conditions */ + struct io_queue oom_ioq; + + /* + * If set to 1, waits for all request completions from current + * queue before new queue is scheduled in + */ + unsigned int fairness; +}; + +/* Logging facilities; debug variants use do { } while (0) to stay one statement. */ +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +#define elv_log_ioq(efqd, ioq, fmt, args...) \ +do { \ + blk_add_trace_msg((efqd)->queue, "elv%d%c %s " fmt, (ioq)->pid, \ + elv_ioq_sync(ioq) ? 'S' : 'A', \ + ioq_to_io_group(ioq)->path, ##args); \ +} while (0) + +#define elv_log_iog(efqd, iog, fmt, args...) \ +do { \ + blk_add_trace_msg((efqd)->queue, "elv %s " fmt, (iog)->path, ##args); \ +} while (0) + +#else +#define elv_log_ioq(efqd, ioq, fmt, args...) \ + blk_add_trace_msg((efqd)->queue, "elv%d%c " fmt, (ioq)->pid, \ + elv_ioq_sync(ioq) ? 'S' : 'A', ##args) + +#define elv_log_iog(efqd, iog, fmt, args...) \ + blk_add_trace_msg((efqd)->queue, "elv " fmt, ##args) + +#endif + +#define elv_log(efqd, fmt, args...) 
\ + blk_add_trace_msg((efqd)->queue, "elv " fmt, ##args) + +#define ioq_sample_valid(samples) ((samples) > 80) + +/* Some shared queue flag manipulation functions among elevators */ + +enum elv_queue_state_flags { + ELV_QUEUE_FLAG_busy, /* has requests or is under service */ + ELV_QUEUE_FLAG_wait_request, /* waiting for a request */ + ELV_QUEUE_FLAG_must_dispatch, /* must be allowed a dispatch */ + ELV_QUEUE_FLAG_idle_window, /* elevator slice idling enabled */ + ELV_QUEUE_FLAG_slice_new, /* no requests dispatched in slice */ + ELV_QUEUE_FLAG_sync, /* synchronous queue */ + ELV_QUEUE_FLAG_must_expire, /* expire queue even slice is left */ +}; + +#define ELV_IO_QUEUE_FLAG_FNS(name) \ +static inline void elv_mark_ioq_##name(struct io_queue *ioq) \ +{ \ + (ioq)->flags |= (1 << ELV_QUEUE_FLAG_##name); \ +} \ +static inline void elv_clear_ioq_##name(struct io_queue *ioq) \ +{ \ + (ioq)->flags &= ~(1 << ELV_QUEUE_FLAG_##name); \ +} \ +static inline int elv_ioq_##name(struct io_queue *ioq) \ +{ \ + return ((ioq)->flags & (1 << ELV_QUEUE_FLAG_##name)) != 0; \ +} + +ELV_IO_QUEUE_FLAG_FNS(busy) +ELV_IO_QUEUE_FLAG_FNS(wait_request) +ELV_IO_QUEUE_FLAG_FNS(must_dispatch) +ELV_IO_QUEUE_FLAG_FNS(idle_window) +ELV_IO_QUEUE_FLAG_FNS(slice_new) +ELV_IO_QUEUE_FLAG_FNS(sync) +ELV_IO_QUEUE_FLAG_FNS(must_expire) + +#ifdef CONFIG_GROUP_IOSCHED + +enum elv_group_state_flags { + ELV_GROUP_FLAG_idle_window, /* elevator group idling enabled */ + ELV_GROUP_FLAG_wait_request, /* waiting for a request */ + ELV_GROUP_FLAG_wait_busy, /* wait for this queue to get busy */ + ELV_GROUP_FLAG_wait_busy_done, /* Have already waited on this group*/ +}; + +#define ELV_IO_GROUP_FLAG_FNS(name) \ +static inline void elv_mark_iog_##name(struct io_group *iog) \ +{ \ + (iog)->flags |= (1 << ELV_GROUP_FLAG_##name); \ +} \ +static inline void elv_clear_iog_##name(struct io_group *iog) \ +{ \ + (iog)->flags &= ~(1 << ELV_GROUP_FLAG_##name); \ +} \ +static inline int elv_iog_##name(struct io_group *iog) \ +{ \ + 
return ((iog)->flags & (1 << ELV_GROUP_FLAG_##name)) != 0; \ +} + +#else /* GROUP_IOSCHED */ + +#define ELV_IO_GROUP_FLAG_FNS(name) \ +static inline void elv_mark_iog_##name(struct io_group *iog) {} \ +static inline void elv_clear_iog_##name(struct io_group *iog) {} \ +static inline int elv_iog_##name(struct io_group *iog) { return 0; } +#endif /* GROUP_IOSCHED */ + +ELV_IO_GROUP_FLAG_FNS(idle_window) +ELV_IO_GROUP_FLAG_FNS(wait_request) +ELV_IO_GROUP_FLAG_FNS(wait_busy) +ELV_IO_GROUP_FLAG_FNS(wait_busy_done) + +static inline void elv_get_ioq(struct io_queue *ioq) +{ + atomic_inc(&ioq->ref); +} + +static inline unsigned int elv_ioprio_to_weight(int ioprio) +{ + WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + /* Map prio 7 - 0 to weights 200 to 900 */ + return IO_WEIGHT_DEFAULT + (IO_WEIGHT_DEFAULT/5 * (4 - ioprio)); +} + +static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio) +{ + ioq->entity.ioprio = ioprio; + ioq->entity.weight = elv_ioprio_to_weight(ioprio); + ioq->entity.ioprio_changed = 1; +} + +static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq, + int ioprio_class) +{ + ioq->entity.ioprio_class = ioprio_class; + ioq->entity.ioprio_changed = 1; +} + +static inline int elv_ioq_class_idle(struct io_queue *ioq) +{ + return ioq->entity.ioprio_class == IOPRIO_CLASS_IDLE; +} + +static inline int elv_ioq_class_rt(struct io_queue *ioq) +{ + return ioq->entity.ioprio_class == IOPRIO_CLASS_RT; +} + +static inline int elv_ioq_ioprio_class(struct io_queue *ioq) +{ + return ioq->entity.ioprio_class; +} + +static inline int elv_ioq_ioprio(struct io_queue *ioq) +{ + return ioq->entity.ioprio; +} + +static inline int elv_ioq_slice_used(struct io_queue *ioq) +{ + if (elv_ioq_slice_new(ioq)) + return 0; + if (time_before(jiffies, ioq->slice_end)) + return 0; + + return 1; +} + +/* How many request are currently dispatched from the queue */ +static inline int elv_ioq_nr_dispatched(struct io_queue *ioq) +{ + return ioq->dispatched; +} + +/* How 
many request are currently queued in the queue */ +static inline int elv_ioq_nr_queued(struct io_queue *ioq) +{ + return ioq->nr_queued; +} + +static inline void *elv_ioq_sched_queue(struct io_queue *ioq) +{ + if (ioq) + return ioq->sched_queue; + return NULL; +} + +static inline struct io_queue *elv_active_ioq(struct elevator_queue *e) +{ + return e->efqd->active_queue; +} + +static inline void *elv_active_sched_queue(struct elevator_queue *e) +{ + return elv_ioq_sched_queue(elv_active_ioq(e)); +} + +static inline int elv_rq_in_driver(struct elevator_queue *e) +{ + return e->efqd->rq_in_driver; +} + +static inline int elv_nr_busy_ioq(struct elevator_queue *e) +{ + return e->efqd->busy_queues; +} + +/* Helper functions for operating on elevator idle slice timer */ +static inline int +elv_mod_idle_slice_timer(struct elevator_queue *eq, unsigned long expires) +{ + return mod_timer(&eq->efqd->idle_slice_timer, expires); +} + +static inline int elv_del_idle_slice_timer(struct elevator_queue *eq) +{ + return del_timer(&eq->efqd->idle_slice_timer); +} + +static inline void +elv_init_ioq_sched_queue(struct elevator_queue *eq, struct io_queue *ioq, + void *sched_queue) +{ + ioq->sched_queue = sched_queue; +} + +static inline struct io_queue *elv_get_oom_ioq(struct elevator_queue *eq) +{ + return &eq->efqd->oom_ioq; +} + +#ifdef CONFIG_GROUP_IOSCHED + +extern int elv_io_group_allow_merge(struct request *rq, struct bio *bio); +extern void elv_put_iog(struct io_group *iog); +extern struct io_group *elv_io_get_io_group(struct request_queue *q, + struct page *page, int create); +extern struct io_group *elv_io_get_io_group_bio(struct request_queue *q, + struct bio *bio, int create); +extern ssize_t elv_group_idle_show(struct elevator_queue *q, char *name); +extern ssize_t elv_group_idle_store(struct elevator_queue *q, const char *name, + size_t count); +static inline void elv_get_iog(struct io_group *iog) +{ + atomic_inc(&iog->ref); +} + +static inline struct io_group 
*rl_iog(struct request_list *rl) +{ + return container_of(rl, struct io_group, rl); +} + +static inline void elv_get_rl_iog(struct request_list *rl) +{ + elv_get_iog(rl_iog(rl)); +} + +static inline void elv_put_rl_iog(struct request_list *rl) +{ + elv_put_iog(rl_iog(rl)); +} + +extern int elv_set_request_ioq(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask); +extern void elv_reset_request_ioq(struct request_queue *q, struct request *rq); +extern struct io_queue *elv_lookup_ioq_bio(struct request_queue *q, + struct bio *bio); +struct request_list * +elv_get_request_list_bio(struct request_queue *q, struct bio *bio); + +struct request_list * +elv_get_request_list_rq(struct request_queue *q, struct request *rq, int priv); +extern int elv_page_io_group_congested(struct request_queue *q, + struct page *page, int sync); +extern void elv_freed_request(struct request_list *rl, int sync); +extern void elv_get_request(struct request_list *rl, int sync); +extern void elv_updated_nr_group_requests(struct request_queue *q); + +#else /* !GROUP_IOSCHED */ + +static inline int elv_io_group_allow_merge(struct request *rq, struct bio *bio) +{ + return 1; +} + +static inline void elv_get_iog(struct io_group *iog) {} +static inline void elv_put_iog(struct io_group *iog) {} + +static inline struct io_group * +elv_io_get_io_group(struct request_queue *q, struct page *page, int create) +{ + /* In flat mode, there is only root group */ + return q->elevator->efqd->root_group; +} + +static inline struct io_group * +elv_io_get_io_group_bio(struct request_queue *q, struct bio *bio, int create) +{ + return q->elevator->efqd->root_group; +} + +static inline int elv_set_request_ioq(struct request_queue *q, + struct request *rq, struct bio *bio, gfp_t gfp_mask) +{ + return 0; +} + +static inline void +elv_reset_request_ioq(struct request_queue *q, struct request *rq) { } + +static inline struct io_queue * +elv_lookup_ioq_bio(struct request_queue *q, struct bio *bio) 
+{ + return NULL; +} +static inline void elv_get_rl_iog(struct request_list *rl) { } +static inline void elv_put_rl_iog(struct request_list *rl) { } +static inline void elv_updated_nr_group_requests(struct request_queue *q) { } +static inline void elv_freed_request(struct request_list *rl, int sync) { } +static inline void elv_get_request(struct request_list *rl, int sync) { } + +#endif /* GROUP_IOSCHED */ + +extern ssize_t elv_slice_sync_show(struct elevator_queue *q, char *name); +extern ssize_t elv_slice_sync_store(struct elevator_queue *q, const char *name, + size_t count); +extern ssize_t elv_slice_async_show(struct elevator_queue *q, char *name); +extern ssize_t elv_slice_async_store(struct elevator_queue *q, const char *name, + size_t count); +extern ssize_t elv_fairness_show(struct elevator_queue *q, char *name); +extern ssize_t elv_fairness_store(struct elevator_queue *q, const char *name, + size_t count); +/* Functions used by elevator.c */ +extern struct elv_fq_data *elv_alloc_fq_data(struct request_queue *q, + struct elevator_queue *e); +extern void elv_release_fq_data(struct elv_fq_data *efqd); +extern int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e); +extern void elv_exit_fq_data(struct elevator_queue *e); + +extern void elv_ioq_request_add(struct request_queue *q, struct request *rq); +extern void elv_ioq_request_removed(struct elevator_queue *e, + struct request *rq); +extern void elv_dispatched_request_fair(struct elevator_queue *e, + struct request *rq); + +extern void elv_activate_rq_fair(struct request_queue *q, struct request *rq); +extern void elv_deactivate_rq_fair(struct request_queue *q, struct request *rq); + +extern void elv_ioq_completed_request(struct request_queue *q, + struct request *rq); + +extern void *elv_select_ioq(struct request_queue *q, int force); + +/* Functions used by io schedulers */ +extern void elv_put_ioq(struct io_queue *ioq); +extern void elv_ioq_slice_expired(struct request_queue *q, + struct 
io_queue *ioq); +extern int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq, + pid_t pid, int is_sync); +extern void elv_init_ioq_io_group(struct io_queue *ioq, struct io_group *iog); +extern void elv_schedule_dispatch(struct request_queue *q); +extern void *elv_io_group_async_queue_prio(struct io_group *iog, + int ioprio_class, int ioprio); +extern void elv_io_group_set_async_queue(struct io_group *iog, int ioprio_class, + int ioprio, struct io_queue *ioq); +extern struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask); +extern void elv_free_ioq(struct io_queue *ioq); +extern struct io_group *ioq_to_io_group(struct io_queue *ioq); +extern int elv_iog_should_idle(struct io_queue *ioq); + +#else /* CONFIG_ELV_FAIR_QUEUING */ +static inline struct elv_fq_data * +elv_alloc_fq_data(struct request_queue *q, struct elevator_queue *e) +{ + return NULL; +} +static inline void elv_release_fq_data(struct elv_fq_data *efqd) {} + +static inline int +elv_init_fq_data(struct request_queue *q, struct elevator_queue *e) +{ + return 0; +} + +static inline void elv_exit_fq_data(struct elevator_queue *e) {} + +static inline void +elv_activate_rq_fair(struct request_queue *q, struct request *rq) {} + +static inline void +elv_deactivate_rq_fair(struct request_queue *q, struct request *rq) {} + +static inline void +elv_dispatched_request_fair(struct elevator_queue *e, struct request *rq) {} + +static inline void +elv_ioq_request_removed(struct elevator_queue *e, struct request *rq) {} + +static inline void +elv_ioq_request_add(struct request_queue *q, struct request *rq) {} + +static inline void +elv_ioq_completed_request(struct request_queue *q, struct request *rq) {} + +static inline void *elv_ioq_sched_queue(struct io_queue *ioq) { return NULL; } +static inline void *elv_select_ioq(struct request_queue *q, int force) +{ + return NULL; +} + +static inline int elv_io_group_allow_merge(struct request *rq, struct bio *bio) + +{ + return 1; +} +static inline int 
elv_set_request_ioq(struct request_queue *q, + struct request *rq, struct bio *bio, gfp_t gfp_mask) +{ + return 0; +} + +static inline void +elv_reset_request_ioq(struct request_queue *q, struct request *rq) { } + +static inline struct io_queue *elv_lookup_ioq_bio(struct request_queue *q, + struct bio *bio) +{ + return NULL; +} + +static inline void elv_get_rl_iog(struct request_list *rl) { } +static inline void elv_put_rl_iog(struct request_list *rl) { } +static inline void elv_updated_nr_group_requests(struct request_queue *q) { } +static inline void elv_freed_request(struct request_list *rl, int sync) { } +static inline void elv_get_request(struct request_list *rl, int sync) { } + +#endif /* CONFIG_ELV_FAIR_QUEUING */ +#endif /* _ELV_SCHED_H */ +#endif /* CONFIG_BLOCK */ diff --git a/block/elevator.c b/block/elevator.c index 2d511f9..b23db03 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -53,6 +53,15 @@ static const int elv_hash_shift = 6; #define ELV_HASH_ENTRIES (1 << elv_hash_shift) #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) +static inline struct elv_fq_data *elv_efqd(struct elevator_queue *eq) +{ +#ifdef CONFIG_ELV_FAIR_QUEUING + return eq->efqd; +#else + return NULL; +#endif +} + /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. 
@@ -113,6 +122,10 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) !bio_failfast_driver(bio) != !blk_failfast_driver(rq)) return 0; + /* If rq and bio belong to different groups, don't allow merging */ + if (!elv_io_group_allow_merge(rq, bio)) + return 0; + if (!elv_iosched_allow_merge(rq, bio)) return 0; @@ -184,17 +197,62 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static void *elevator_init_queue(struct request_queue *q, - struct elevator_queue *eq) +static void * +elevator_init_data(struct request_queue *q, struct elevator_queue *eq) +{ + void *data = NULL; + + if (eq->ops->elevator_init_fn) { + data = eq->ops->elevator_init_fn(q, eq); + if (data) + return data; + else + return ERR_PTR(-ENOMEM); + } + + /* IO scheduler does not instantiate data (noop), it is not an error */ + return NULL; +} + +static void +elevator_free_sched_queue(struct elevator_queue *eq, void *sched_queue) { - return eq->ops->elevator_init_fn(q); + /* Not all io schedulers (cfq) store sched_queue */ + if (!sched_queue) + return; + eq->ops->elevator_free_sched_queue_fn(eq, sched_queue); +} + +static void * +elevator_alloc_sched_queue(struct request_queue *q, struct elevator_queue *eq) +{ + void *sched_queue = NULL; + + /* + * If fair queuing is enabled, then queue allocation takes place + * during set_request() functions when request actually comes + * in. 
+ */ + if (elv_iosched_fair_queuing_enabled(eq)) + return NULL; + + if (eq->ops->elevator_alloc_sched_queue_fn) { + sched_queue = eq->ops->elevator_alloc_sched_queue_fn(q, eq, + GFP_KERNEL, NULL); + if (!sched_queue) + return ERR_PTR(-ENOMEM); + + } + + return sched_queue; } static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, - void *data) + void *data, void *sched_queue) { q->elevator = eq; eq->elevator_data = data; + eq->sched_queue = sched_queue; } static char chosen_elevator[16]; @@ -239,8 +297,21 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, for (i = 0; i < ELV_HASH_ENTRIES; i++) INIT_HLIST_HEAD(&eq->hash[i]); +#ifdef CONFIG_ELV_FAIR_QUEUING + eq->efqd = elv_alloc_fq_data(q, eq); + + if (!eq->efqd) + goto err; + + if (elv_init_fq_data(q, eq)) + goto err; +#endif return eq; err: + if (elv_efqd(eq)) + elv_release_fq_data(elv_efqd(eq)); + if (eq->hash) + kfree(eq->hash); kfree(eq); elevator_put(e); return NULL; @@ -252,6 +323,7 @@ static void elevator_release(struct kobject *kobj) e = container_of(kobj, struct elevator_queue, kobj); elevator_put(e->elevator_type); + elv_release_fq_data(elv_efqd(e)); kfree(e->hash); kfree(e); } @@ -261,7 +333,7 @@ int elevator_init(struct request_queue *q, char *name) struct elevator_type *e = NULL; struct elevator_queue *eq; int ret = 0; - void *data; + void *data = NULL, *sched_queue = NULL; INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; @@ -295,13 +367,21 @@ int elevator_init(struct request_queue *q, char *name) if (!eq) return -ENOMEM; - data = elevator_init_queue(q, eq); - if (!data) { + data = elevator_init_data(q, eq); + + if (IS_ERR(data)) { kobject_put(&eq->kobj); return -ENOMEM; } - elevator_attach(q, eq, data); + sched_queue = elevator_alloc_sched_queue(q, eq); + + if (IS_ERR(sched_queue)) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + + elevator_attach(q, eq, data, sched_queue); return ret; } EXPORT_SYMBOL(elevator_init); @@ -309,6 +389,8 @@ 
EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); + elevator_free_sched_queue(e, e->sched_queue); + elv_exit_fq_data(e); if (e->ops->elevator_exit_fn) e->ops->elevator_exit_fn(e); e->ops = NULL; @@ -438,6 +520,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) elv_rqhash_del(q, rq); q->nr_sorted--; + elv_dispatched_request_fair(q->elevator, rq); boundary = q->end_sector; stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; @@ -478,6 +561,7 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) elv_rqhash_del(q, rq); q->nr_sorted--; + elv_dispatched_request_fair(q->elevator, rq); q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; @@ -545,6 +629,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, elv_rqhash_del(q, next); q->nr_sorted--; + elv_ioq_request_removed(e, next); q->last_merge = rq; } @@ -593,7 +678,7 @@ void elv_quiesce_start(struct request_queue *q) * make sure we don't have any requests in flight */ elv_drain_elevator(q); - while (q->rq.elvpriv) { + while (q->rq_data.elvpriv) { __blk_run_queue(q); spin_unlock_irq(q->queue_lock); msleep(10); @@ -651,12 +736,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->last_merge = rq; } - /* - * Some ioscheds (cfq) run q->request_fn directly, so - * rq cannot be accessed after calling - * elevator_add_req_fn. 
- */ q->elevator->ops->elevator_add_req_fn(q, rq); + elv_ioq_request_add(q, rq); break; case ELEVATOR_INSERT_REQUEUE: @@ -696,8 +777,9 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) } if (unplug_it && blk_queue_plugged(q)) { - int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - - queue_in_flight(q); + int nrq = q->rq_data.count[BLK_RW_SYNC] + + q->rq_data.count[BLK_RW_ASYNC] - + queue_in_flight(q); if (nrq >= q->unplug_thresh) __generic_unplug_device(q); @@ -755,13 +837,12 @@ EXPORT_SYMBOL(elv_add_request); int elv_queue_empty(struct request_queue *q) { - struct elevator_queue *e = q->elevator; - if (!list_empty(&q->queue_head)) return 0; - if (e->ops->elevator_queue_empty_fn) - return e->ops->elevator_queue_empty_fn(q); + /* Hopefully nr_sorted works and no need to call queue_empty_fn */ + if (q->nr_sorted) + return 0; return 1; } @@ -785,12 +866,20 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) { struct elevator_queue *e = q->elevator; + /* + * Optimization for noop, deadline and AS which maintain only single + * ioq per io group + */ + if (elv_iosched_single_ioq(e)) + return elv_set_request_ioq(q, rq, bio, gfp_mask); + if (e->ops->elevator_set_req_fn) - return e->ops->elevator_set_req_fn(q, rq, gfp_mask); + return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); rq->elevator_private = NULL; return 0; @@ -800,6 +889,15 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + /* + * Optimization for noop, deadline and AS which maintain only single + * ioq per io group + */ + if (elv_iosched_single_ioq(e)) { + elv_reset_request_ioq(q, rq); + return; + } + if (e->ops->elevator_put_req_fn) e->ops->elevator_put_req_fn(rq); } @@ -841,8 
+939,11 @@ void elv_completed_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) - e->ops->elevator_completed_req_fn(q, rq); + if (blk_sorted_rq(rq)) { + if (e->ops->elevator_completed_req_fn) + e->ops->elevator_completed_req_fn(q, rq); + elv_ioq_completed_request(q, rq); + } } /* @@ -995,7 +1096,7 @@ EXPORT_SYMBOL_GPL(elv_unregister); static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old_elevator, *e; - void *data; + void *data = NULL, *sched_queue = NULL; /* * Allocate new elevator @@ -1004,10 +1105,18 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (!e) return 0; - data = elevator_init_queue(q, e); - if (!data) { + data = elevator_init_data(q, e); + + if (IS_ERR(data)) { kobject_put(&e->kobj); return 0; + } + + sched_queue = elevator_alloc_sched_queue(q, e); + + if (IS_ERR(sched_queue)) { + kobject_put(&e->kobj); + return 0; } /* @@ -1024,7 +1133,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* * attach and start new elevator */ - elevator_attach(q, e, data); + elevator_attach(q, e, data, sched_queue); spin_unlock_irq(q->queue_lock); @@ -1138,3 +1247,53 @@ struct request *elv_rb_latter_request(struct request_queue *q, return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); + +/* Get the io scheduler queue pointer. */ +void *elv_get_sched_queue(struct request_queue *q, struct request *rq) +{ + /* + * io scheduler is not using fair queuing. Return sched_queue + * pointer stored in elevator_queue. It will be null if io + * scheduler never stored anything there to begin with (cfq) + */ + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return q->elevator->sched_queue; + + /* + * IO scheduler is using fair queuing infrastructure. 
If io scheduler + * has passed a non null rq, retrieve sched_queue pointer from + * there. */ + if (rq) + return elv_ioq_sched_queue(req_ioq(rq)); + + return NULL; +} +EXPORT_SYMBOL(elv_get_sched_queue); + +/* Select an ioscheduler queue to dispatch request from. */ +void *elv_select_sched_queue(struct request_queue *q, int force) +{ + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return q->elevator->sched_queue; + + return elv_ioq_sched_queue(elv_select_ioq(q, force)); +} +EXPORT_SYMBOL(elv_select_sched_queue); + +/* + * Get the io scheduler queue pointer for the group bio belongs to. + * + * If fair queuing is enabled, determine the io group of task and retrieve + * the ioq pointer from that. This is used by only single queue ioschedulers + * for retrieving the queue associated with the group to decide whether the + * new bio can do a front merge or not. + */ +void *elv_get_sched_queue_bio(struct request_queue *q, struct bio *bio) +{ + /* Fair queuing is not enabled. There is only one queue. 
*/ + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return q->elevator->sched_queue; + + return elv_ioq_sched_queue(elv_lookup_ioq_bio(q, bio)); +} +EXPORT_SYMBOL(elv_get_sched_queue_bio); diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3a0d369..4ba496f 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -6,8 +6,9 @@ #include #include #include +#include "elevator-fq.h" -struct noop_data { +struct noop_queue { struct list_head queue; }; @@ -19,11 +20,14 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq, static int noop_dispatch(struct request_queue *q, int force) { - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_select_sched_queue(q, force); - if (!list_empty(&nd->queue)) { + if (!nq) + return 0; + + if (!list_empty(&nq->queue)) { struct request *rq; - rq = list_entry(nd->queue.next, struct request, queuelist); + rq = list_entry(nq->queue.next, struct request, queuelist); list_del_init(&rq->queuelist); elv_dispatch_sort(q, rq); return 1; @@ -33,24 +37,17 @@ static int noop_dispatch(struct request_queue *q, int force) static void noop_add_request(struct request_queue *q, struct request *rq) { - struct noop_data *nd = q->elevator->elevator_data; - - list_add_tail(&rq->queuelist, &nd->queue); -} - -static int noop_queue_empty(struct request_queue *q) -{ - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_get_sched_queue(q, rq); - return list_empty(&nd->queue); + list_add_tail(&rq->queuelist, &nq->queue); } static struct request * noop_former_request(struct request_queue *q, struct request *rq) { - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_get_sched_queue(q, rq); - if (rq->queuelist.prev == &nd->queue) + if (rq->queuelist.prev == &nq->queue) return NULL; return list_entry(rq->queuelist.prev, struct request, queuelist); } @@ -58,43 +55,57 @@ noop_former_request(struct request_queue *q, struct request *rq) 
static struct request * noop_latter_request(struct request_queue *q, struct request *rq) { - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_get_sched_queue(q, rq); - if (rq->queuelist.next == &nd->queue) + if (rq->queuelist.next == &nq->queue) return NULL; return list_entry(rq->queuelist.next, struct request, queuelist); } -static void *noop_init_queue(struct request_queue *q) +static void *noop_alloc_noop_queue(struct request_queue *q, + struct elevator_queue *eq, gfp_t gfp_mask, struct io_queue *ioq) { - struct noop_data *nd; + struct noop_queue *nq; - nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); - if (!nd) - return NULL; - INIT_LIST_HEAD(&nd->queue); - return nd; + nq = kmalloc_node(sizeof(*nq), gfp_mask | __GFP_ZERO, q->node); + if (nq == NULL) + goto out; + + INIT_LIST_HEAD(&nq->queue); +out: + return nq; } -static void noop_exit_queue(struct elevator_queue *e) +static void noop_free_noop_queue(struct elevator_queue *e, void *sched_queue) { - struct noop_data *nd = e->elevator_data; + struct noop_queue *nq = sched_queue; - BUG_ON(!list_empty(&nd->queue)); - kfree(nd); + kfree(nq); } +#ifdef CONFIG_IOSCHED_NOOP_HIER +static struct elv_fs_entry noop_attrs[] = { + ELV_ATTR(fairness), + ELV_ATTR(slice_sync), + ELV_ATTR(group_idle), + __ATTR_NULL +}; +#endif + static struct elevator_type elevator_noop = { .ops = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, - .elevator_queue_empty_fn = noop_queue_empty, .elevator_former_req_fn = noop_former_request, .elevator_latter_req_fn = noop_latter_request, - .elevator_init_fn = noop_init_queue, - .elevator_exit_fn = noop_exit_queue, + .elevator_alloc_sched_queue_fn = noop_alloc_noop_queue, + .elevator_free_sched_queue_fn = noop_free_noop_queue, }, +#ifdef CONFIG_IOSCHED_NOOP_HIER + .elevator_features = ELV_IOSCHED_NEED_FQ | ELV_IOSCHED_SINGLE_IOQ, + .elevator_attrs = noop_attrs, +#endif .elevator_name = 
"noop", .elevator_owner = THIS_MODULE, }; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index d952b34..224d5a8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1170,7 +1170,8 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } -int dm_table_any_congested(struct dm_table *t, int bdi_bits) +int dm_table_any_congested(struct dm_table *t, int bdi_bits, struct page *page, + int group) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); @@ -1180,9 +1181,11 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); char b[BDEVNAME_SIZE]; - if (likely(q)) - r |= bdi_congested(&q->backing_dev_info, bdi_bits); - else + if (likely(q)) { + struct backing_dev_info *bdi = &q->backing_dev_info; + r |= group ? bdi_congested_group(bdi, bdi_bits, page) + : bdi_congested(bdi, bdi_bits); + } else DMWARN_LIMIT("%s: any_congested: nonexistent device %s", dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8a311ea..00a7d94 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1608,7 +1608,8 @@ static void dm_unplug_all(struct request_queue *q) } } -static int dm_any_congested(void *congested_data, int bdi_bits) +static int dm_any_congested(void *congested_data, int bdi_bits, + struct page *page, int group) { int r = bdi_bits; struct mapped_device *md = congested_data; @@ -1625,8 +1626,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits) r = md->queue->backing_dev_info.state & bdi_bits; else - r = dm_table_any_congested(map, bdi_bits); - + r = dm_table_any_congested(map, bdi_bits, page, + group); dm_table_put(map); } } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a7663eb..bf533a9 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -57,7 +57,8 @@ struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void 
dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); -int dm_table_any_congested(struct dm_table *t, int bdi_bits); +int dm_table_any_congested(struct dm_table *t, int bdi_bits, struct page *page, + int group); int dm_table_any_busy_target(struct dm_table *t); int dm_table_set_type(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5fe39c2..10765da 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -102,7 +102,7 @@ static void linear_unplug(struct request_queue *q) rcu_read_unlock(); } -static int linear_congested(void *data, int bits) +static int linear_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; linear_conf_t *conf; @@ -113,7 +113,10 @@ static int linear_congested(void *data, int bits) for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + struct backing_dev_info *bdi = &q->backing_dev_info; + + ret |= group ? 
bdi_congested_group(bdi, bits, page) : + bdi_congested(bdi, bits); } rcu_read_unlock(); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7140909..52a54c7 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -192,7 +192,8 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) seq_printf (seq, "]"); } -static int multipath_congested(void *data, int bits) +static int multipath_congested(void *data, int bits, struct page *page, + int group) { mddev_t *mddev = data; multipath_conf_t *conf = mddev->private; @@ -203,8 +204,10 @@ static int multipath_congested(void *data, int bits) mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? bdi_congested_group(bdi, bits, page) + : bdi_congested(bdi, bits); /* Just like multipath_map, we just check the * first available device */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 898e2bd..915a95f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -37,7 +37,7 @@ static void raid0_unplug(struct request_queue *q) } } -static int raid0_congested(void *data, int bits) +static int raid0_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; raid0_conf_t *conf = mddev->private; @@ -46,8 +46,10 @@ static int raid0_congested(void *data, int bits) for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? 
bdi_congested_group(bdi, bits, page) + : bdi_congested(bdi, bits); } return ret; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8726fd7..0f0c6ac 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -570,7 +570,7 @@ static void raid1_unplug(struct request_queue *q) md_wakeup_thread(mddev->thread); } -static int raid1_congested(void *data, int bits) +static int raid1_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; conf_t *conf = mddev->private; @@ -581,14 +581,17 @@ static int raid1_congested(void *data, int bits) mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; /* Note the '|| 1' - when read_balance prefers * non-congested targets, it can be removed */ if ((bits & (1<<BDI_async_congested)) || 1) - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? bdi_congested_group(bdi, bits, + page) : bdi_congested(bdi, bits); else - ret &= bdi_congested(&q->backing_dev_info, bits); + ret &= group ? bdi_congested_group(bdi, bits, + page) : bdi_congested(bdi, bits); } } rcu_read_unlock(); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3d9020c..d85351f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -625,7 +625,7 @@ static void raid10_unplug(struct request_queue *q) md_wakeup_thread(mddev->thread); } -static int raid10_congested(void *data, int bits) +static int raid10_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; conf_t *conf = mddev->private; @@ -636,8 +636,10 @@ static int raid10_congested(void *data, int bits) mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? 
bdi_congested_group(bdi, bits, page) + : bdi_congested(bdi, bits); } } rcu_read_unlock(); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b8a2c5d..b6cc455 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3323,7 +3323,7 @@ static void raid5_unplug_device(struct request_queue *q) unplug_slaves(mddev); } -static int raid5_congested(void *data, int bits) +static int raid5_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; raid5_conf_t *conf = mddev->private; diff --git a/fs/afs/write.c b/fs/afs/write.c index c2e7a7f..aa8b359 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -455,7 +455,7 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) } wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) + if (wbc->nonblocking && bdi_or_group_write_congested(bdi, page)) wbc->encountered_congestion = 1; _leave(" = 0"); @@ -491,6 +491,12 @@ static int afs_writepages_region(struct address_space *mapping, return 0; } + if (wbc->nonblocking && bdi_write_congested_group(bdi, page)) { + wbc->encountered_congestion = 1; + page_cache_release(page); + break; + } + /* at this point we hold neither mapping->tree_lock nor lock on * the page itself: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled back from diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e83be2e..35cd95a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1249,7 +1249,8 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, return root; } -static int btrfs_congested_fn(void *congested_data, int bdi_bits) +static int btrfs_congested_fn(void *congested_data, int bdi_bits, + struct page *page, int group) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; int ret = 0; @@ -1260,7 +1261,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); - 
if (bdi && bdi_congested(bdi, bdi_bits)) { + if (bdi && (group ? bdi_congested_group(bdi, bdi_bits, page) : + bdi_congested(bdi, bdi_bits))) { ret = 1; break; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6826018..fd7d53f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2368,6 +2368,18 @@ retry: unsigned i; scanned = 1; + + /* + * If the io group page will go into is congested, bail out. + */ + if (wbc->nonblocking + && bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5dbefd1..ed2d100 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -165,6 +165,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; + struct page *page; bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; @@ -276,8 +277,11 @@ loop_lock: * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && batch_run > 32 && - fs_info->fs_devices->open_devices > 1) { + if (pending) + page = bio_iovec_idx(pending, 0)->bv_page; + + if (pending && bdi_or_group_write_congested(bdi, page) && + num_run > 32 && fs_info->fs_devices->open_devices > 1) { struct io_context *ioc; ioc = current->io_context; diff --git a/fs/buffer.c b/fs/buffer.c index 28f320f..8efcd82 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -668,6 +669,7 @@ static void __set_page_dirty(struct page *page, if (page->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); + blkio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c34b7f8..33d0339 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1470,6 +1470,17 @@ retry: n_iov = 0; bytes_to_write = 0; + /* + * If the io group page will go into is congested, bail out. + */ + if (wbc->nonblocking && + bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + for (i = 0; i < nr_pages; i++) { page = pvec.pages[i]; /* diff --git a/fs/direct-io.c b/fs/direct-io.c index 8b10b87..185ba0a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -797,6 +798,7 @@ static int do_direct_IO(struct dio *dio) ret = PTR_ERR(page); goto out; } + blkio_cgroup_reset_owner(page, current->mm); while (block_in_page < blocks_per_page) { unsigned offset_in_page = block_in_page << blkbits; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 15387c9..090a961 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -179,7 +179,7 @@ static void ext2_preread_inode(struct inode *inode) struct backing_dev_info *bdi; bdi = inode->i_mapping->backing_dev_info; - if (bdi_read_congested(bdi)) + if (bdi_or_group_read_congested(bdi, NULL)) return; if (bdi_write_congested(bdi)) return; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 7ebae9a..f5fba6c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -371,6 +371,18 @@ retry: PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { scanned = 1; + + /* + * If io group page belongs to is congested. bail out. 
+ */ + if (wbc->nonblocking + && bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); if (ret) done = 1; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 9e3fe17..aa29612 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -266,8 +266,9 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) { struct bio *bio = wi->bio; int err; + struct page *page = bio_iovec_idx(bio, 0)->bv_page; - if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { + if (wi->nbio > 0 && bdi_or_group_write_congested(wi->bdi, page)) { wait_for_completion(&wi->bio_event); wi->nbio--; if (unlikely(atomic_read(&wi->err))) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index aecf251..5835a2e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -891,7 +891,7 @@ xfs_convert_page( bdi = inode->i_mapping->backing_dev_info; wbc->nr_to_write--; - if (bdi_write_congested(bdi)) { + if (bdi_or_group_write_congested(bdi, page)) { wbc->encountered_congestion = 1; done = 1; } else if (wbc->nr_to_write <= 0) { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 965df12..473223a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -714,7 +714,7 @@ xfs_buf_readahead( struct backing_dev_info *bdi; bdi = target->bt_mapping->backing_dev_info; - if (bdi_read_congested(bdi)) + if (bdi_or_group_read_congested(bdi, NULL)) return; flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1d52425..1b13539 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,7 @@ enum bdi_state { BDI_unused, /* Available bits start here */ }; -typedef int (congested_fn)(void *, int); +typedef int (congested_fn)(void *, int, struct page *, int); enum 
bdi_stat_item { BDI_RECLAIMABLE, @@ -209,7 +209,7 @@ int writeback_in_progress(struct backing_dev_info *bdi); static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); + return bdi->congested_fn(bdi->congested_data, bdi_bits, NULL, 0); return (bdi->state & bdi_bits); } @@ -229,6 +229,63 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << BDI_async_congested)); } +#ifdef CONFIG_GROUP_IOSCHED +extern int bdi_congested_group(struct backing_dev_info *bdi, int bdi_bits, + struct page *page); + +extern int bdi_read_congested_group(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_or_group_read_congested(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_write_congested_group(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_or_group_write_congested(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_rw_congested_group(struct backing_dev_info *bdi, + struct page *page); +#else /* CONFIG_GROUP_IOSCHED */ +static inline int bdi_congested_group(struct backing_dev_info *bdi, + int bdi_bits, struct page *page) +{ + return bdi_congested(bdi, bdi_bits); +} + +static inline int bdi_read_congested_group(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_read_congested(bdi); +} + +static inline int bdi_or_group_read_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_read_congested(bdi); +} + +static inline int bdi_write_congested_group(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_write_congested(bdi); +} + +static inline int bdi_or_group_write_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_write_congested(bdi); +} + +static inline int bdi_rw_congested_group(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_rw_congested(bdi); +} + +#endif /* CONFIG_GROUP_IOSCHED */ + enum { 
BLK_RW_ASYNC = 0, BLK_RW_SYNC = 1, @@ -237,7 +294,7 @@ enum { void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); - +extern void congestion_wake_up(int sync); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { diff --git a/include/linux/biotrack.h b/include/linux/biotrack.h new file mode 100644 index 0000000..2b8bb0b --- /dev/null +++ b/include/linux/biotrack.h @@ -0,0 +1,100 @@ +#include +#include +#include + +#ifndef _LINUX_BIOTRACK_H +#define _LINUX_BIOTRACK_H + +#ifdef CONFIG_CGROUP_BLKIO + +struct io_context; +struct block_device; + +struct blkio_cgroup { + struct cgroup_subsys_state css; + struct io_context *io_context; /* default io_context */ +/* struct radix_tree_root io_context_root; per device io_context */ +}; + +/** + * __init_blkio_page_cgroup() - initialize a blkio_page_cgroup + * @pc: page_cgroup of the page + * + * Reset the owner ID of a page. + */ +static inline void __init_blkio_page_cgroup(struct page_cgroup *pc) +{ + pc->blkio_cgroup_id = 0; +} + +/** + * blkio_cgroup_disabled() - check whether blkio_cgroup is disabled + * + * Returns true if disabled, false if not. 
+ */ +static inline bool blkio_cgroup_disabled(void) +{ + if (blkio_cgroup_subsys.disabled) + return true; + return false; +} + +extern void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm); +extern void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm); +extern void blkio_cgroup_reset_owner_pagedirty(struct page *page, + struct mm_struct *mm); +extern void blkio_cgroup_copy_owner(struct page *page, struct page *opage); + +extern struct io_context *get_blkio_cgroup_iocontext(struct bio *bio); +extern unsigned long get_blkio_cgroup_id(struct bio *bio); +extern struct cgroup *get_cgroup_from_page(struct page *page); + +#else /* !CONFIG_CGROUP_BLKIO */ + +struct blkio_cgroup; + +static inline void __init_blkio_page_cgroup(struct page_cgroup *pc) +{ +} + +static inline bool blkio_cgroup_disabled(void) +{ + return true; +} + +static inline void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm) +{ +} + +static inline void blkio_cgroup_reset_owner(struct page *page, + struct mm_struct *mm) +{ +} + +static inline void blkio_cgroup_reset_owner_pagedirty(struct page *page, + struct mm_struct *mm) +{ +} + +static inline void blkio_cgroup_copy_owner(struct page *page, struct page *opage) +{ +} + +static inline struct io_context *get_blkio_cgroup_iocontext(struct bio *bio) +{ + return NULL; +} + +static inline unsigned long get_blkio_cgroup_id(struct bio *bio) +{ + return 0; +} + +static inline struct cgroup *get_cgroup_from_page(struct page *page) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_BLKIO */ + +#endif /* _LINUX_BIOTRACK_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69103e0..247e237 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -32,21 +32,51 @@ struct request; struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 + +#ifdef CONFIG_GROUP_IOSCHED +#define BLKDEV_MAX_RQ 512 /* Default maximum for queue */ +#define BLKDEV_MAX_GROUP_RQ 128 /* Default maximum per group*/ +#else #define 
BLKDEV_MAX_RQ 128 /* Default maximum */ +/* + * This is equivalent to the case of only one group present (root group). Let + * it consume all the request descriptors available on the queue. + */ +#define BLKDEV_MAX_GROUP_RQ BLKDEV_MAX_RQ /* Default maximum */ +#endif struct request; typedef void (rq_end_io_fn)(struct request *, int); struct request_list { /* - * count[], starved[], and wait[] are indexed by + * count[], starved and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC */ int count[2]; int starved[2]; + wait_queue_head_t wait[2]; +}; + +/* + * This data structure keeps track of the mempool of requests for the queue + * and some overall statistics. + */ +struct request_data { + /* + * Per queue request descriptor count. This is in addition to per + * cgroup count + */ + int count[2]; int elvpriv; mempool_t *rq_pool; - wait_queue_head_t wait[2]; + int starved; + /* + * Global list for starved tasks. A task will be queued here if + * it could not allocate request descriptor and the associated + * group request list does not have any requests pending. + */ + wait_queue_head_t starved_wait; }; /* @@ -229,6 +259,11 @@ struct request { /* for bidi */ struct request *next_rq; + +#ifdef CONFIG_ELV_FAIR_QUEUING + /* io queue request belongs to */ + struct io_queue *ioq; +#endif }; static inline unsigned short req_get_ioprio(struct request *req) @@ -236,6 +271,15 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } +static inline struct io_queue *req_ioq(struct request *req) +{ +#ifdef CONFIG_ELV_FAIR_QUEUING + return req->ioq; +#else + return NULL; +#endif +} + /* * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME * requests. Some step values could eventually be made generic. 
@@ -325,10 +369,17 @@ struct request_queue struct request *last_merge; struct elevator_queue *elevator; +#ifndef CONFIG_GROUP_IOSCHED /* * the queue request freelist, one for reads and one for writes + * In case of group io scheduling, this request list is per group + * and is present in group data structure. */ struct request_list rq; +#endif + + /* Contains request pool and other data like starved data */ + struct request_data rq_data; request_fn_proc *request_fn; make_request_fn *make_request_fn; @@ -391,6 +442,8 @@ struct request_queue * queue settings */ unsigned long nr_requests; /* Max # of requests */ + /* Max # of per io group requests */ + unsigned long nr_group_requests; unsigned int nr_congestion_on; unsigned int nr_congestion_off; unsigned int nr_batching; @@ -770,6 +823,10 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); +extern void blk_init_request_list(struct request_list *rl); + +extern struct request_list *blk_get_request_list(struct request_queue *q, + struct bio *bio); /* * A queue has just exitted congestion. 
Note this in the global counter of * congested queues, and wake up anyone who was waiting for requests to be @@ -789,6 +846,11 @@ static inline void blk_set_queue_congested(struct request_queue *q, int sync) set_bdi_congested(&q->backing_dev_info, sync); } +#ifdef CONFIG_GROUP_IOSCHED +extern int blk_queue_io_group_congested(struct backing_dev_info *bdi, + int bdi_bits, struct page *page); +#endif + extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31b..78504f3 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -43,6 +43,12 @@ SUBSYS(mem_cgroup) /* */ +#ifdef CONFIG_CGROUP_BLKIO +SUBSYS(blkio_cgroup) +#endif + +/* */ + #ifdef CONFIG_CGROUP_DEVICE SUBSYS(devices) #endif @@ -60,3 +66,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_GROUP_IOSCHED +SUBSYS(io) +#endif + +/* */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1cb3372..0ace96e 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -22,13 +22,28 @@ typedef struct request *(elevator_request_list_fn) (struct request_queue *, stru typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); typedef int (elevator_may_queue_fn) (struct request_queue *, int); -typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); +typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, + struct bio *bio, gfp_t); typedef void (elevator_put_req_fn) (struct request *); typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); -typedef void *(elevator_init_fn) (struct request_queue *); +typedef void *(elevator_init_fn) (struct request_queue *, + struct elevator_queue *); typedef void 
(elevator_exit_fn) (struct elevator_queue *); +typedef void (elevator_free_sched_queue_fn) (struct elevator_queue*, void *); +typedef void* (elevator_alloc_sched_queue_fn) (struct request_queue *q, + struct elevator_queue *eq, gfp_t, struct io_queue *ioq); +#ifdef CONFIG_ELV_FAIR_QUEUING +typedef void (elevator_active_ioq_set_fn) (struct request_queue*, void *, int); +typedef void (elevator_active_ioq_reset_fn) (struct request_queue *, void*); +typedef void (elevator_arm_slice_timer_fn) (struct request_queue*, void*); +typedef int (elevator_should_preempt_fn) (struct request_queue*, void*, + struct request*); +typedef struct io_queue* (elevator_close_cooperator_fn) (struct request_queue*, + void*); +typedef int (elevator_expire_ioq_fn) (struct request_queue*, void *, int, int); +#endif struct elevator_ops { @@ -56,6 +71,18 @@ struct elevator_ops elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; void (*trim)(struct io_context *); + + elevator_alloc_sched_queue_fn *elevator_alloc_sched_queue_fn; + elevator_free_sched_queue_fn *elevator_free_sched_queue_fn; +#ifdef CONFIG_ELV_FAIR_QUEUING + elevator_active_ioq_set_fn *elevator_active_ioq_set_fn; + elevator_active_ioq_reset_fn *elevator_active_ioq_reset_fn; + + elevator_arm_slice_timer_fn *elevator_arm_slice_timer_fn; + elevator_should_preempt_fn *elevator_should_preempt_fn; + elevator_close_cooperator_fn *elevator_close_cooperator_fn; + elevator_expire_ioq_fn *elevator_expire_ioq_fn; +#endif }; #define ELV_NAME_MAX (16) @@ -76,6 +103,9 @@ struct elevator_type struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; +#ifdef CONFIG_ELV_FAIR_QUEUING + int elevator_features; +#endif }; /* @@ -85,10 +115,15 @@ struct elevator_queue { struct elevator_ops *ops; void *elevator_data; + void *sched_queue; struct kobject kobj; struct elevator_type *elevator_type; struct mutex sysfs_lock; struct hlist_head *hash; +#ifdef CONFIG_ELV_FAIR_QUEUING + /* fair queuing 
data */ + struct elv_fq_data *efqd; +#endif }; /* @@ -112,7 +147,8 @@ extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, int); extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); -extern int elv_set_request(struct request_queue *, struct request *, gfp_t); +extern int elv_set_request(struct request_queue *, struct request *, + struct bio *bio, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); @@ -207,5 +243,54 @@ enum { __val; \ }) +/* iosched can let elevator know their feature set/capability */ +#ifdef CONFIG_ELV_FAIR_QUEUING + +/* iosched wants to use fair queuing logic of elevator layer */ +#define ELV_IOSCHED_NEED_FQ 1 + +/* iosched maintains only single ioq per group.*/ +#define ELV_IOSCHED_SINGLE_IOQ 2 + +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e) +{ + return (e->elevator_type->elevator_features) & ELV_IOSCHED_NEED_FQ; +} + +static inline int elv_iosched_single_ioq(struct elevator_queue *e) +{ + return (e->elevator_type->elevator_features) & ELV_IOSCHED_SINGLE_IOQ; +} + +#else /* ELV_IOSCHED_FAIR_QUEUING */ + +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e) +{ + return 0; +} + +static inline int elv_iosched_single_ioq(struct elevator_queue *e) +{ + return 0; +} + +#endif /* ELV_IOSCHED_FAIR_QUEUING */ +extern void *elv_get_sched_queue(struct request_queue *q, struct request *rq); +extern void *elv_select_sched_queue(struct request_queue *q, int force); +extern void *elv_get_sched_queue_bio(struct request_queue *q, struct bio *bio); + +/* + * This is equivalent of rq_is_sync()/cfq_bio_sync() function where we + * determine whether an rq/bio is sync or not. 
There are cases like during + * merging and during request allocation, where we don't have rq but bio + * and need to find out if this bio will be considered as sync or async by + * elevator/iosched. This function is useful in such cases. + */ +static inline int elv_bio_sync(struct bio *bio) +{ + if ((bio_data_dir(bio) == READ) || bio_sync(bio)) + return 1; + return 0; +} #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 4da4a75..1baa6c1 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -73,6 +73,11 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_GROUP_IOSCHED + /* If task changes the cgroup, elevator processes it asynchronously */ + unsigned short cgroup_changed; +#endif + /* * For request batching */ @@ -104,6 +109,7 @@ int put_io_context(struct io_context *ioc); void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +void init_io_context(struct io_context *ioc); void copy_io_context(struct io_context **pdst, struct io_context **psrc); #else static inline void exit_io_context(void) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e46a073..eb45fe9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -37,6 +37,8 @@ struct mm_struct; * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) 
*/ +extern void __init_mem_page_cgroup(struct page_cgroup *pc); + extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ @@ -121,6 +123,10 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val); #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; +static inline void __init_mem_page_cgroup(struct page_cgroup *pc) +{ +} + static inline int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8895985..c9d1ed4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -605,7 +605,7 @@ typedef struct pglist_data { int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE struct page_cgroup *node_page_cgroup; #endif #endif @@ -956,7 +956,7 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE /* * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use * section. (see memcontrol.h/page_cgroup.h about this.) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 13f126c..bca6c8a 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -1,7 +1,7 @@ #ifndef __LINUX_PAGE_CGROUP_H #define __LINUX_PAGE_CGROUP_H -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE #include /* * Page Cgroup can be considered as an extended mem_map. 
@@ -14,6 +14,7 @@ struct page_cgroup { unsigned long flags; struct mem_cgroup *mem_cgroup; struct page *page; + unsigned long blkio_cgroup_id; struct list_head lru; /* per cgroup LRU list */ }; @@ -83,7 +84,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) bit_spin_unlock(PCG_LOCK, &pc->flags); } -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +#else /* CONFIG_CGROUP_PAGE */ struct page_cgroup; static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 9a74b46..af6c9e5 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -397,7 +397,8 @@ TRACE_EVENT(block_unplug_timer, ), TP_fast_assign( - __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + __entry->nr_rq = q->rq_data.count[READ] + + q->rq_data.count[WRITE]; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -416,7 +417,8 @@ TRACE_EVENT(block_unplug_io, ), TP_fast_assign( - __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + __entry->nr_rq = q->rq_data.count[READ] + + q->rq_data.count[WRITE]; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), diff --git a/init/Kconfig b/init/Kconfig index 3f7e609..54aa85a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -612,8 +612,32 @@ config CGROUP_MEM_RES_CTLR_SWAP Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +config GROUP_IOSCHED + bool + depends on CGROUPS && ELV_FAIR_QUEUING + default n + ---help--- + This feature lets IO scheduler recognize task groups and control + disk bandwidth allocation to such task groups. + endif # CGROUPS +config CGROUP_BLKIO + bool + depends on CGROUPS && BLOCK + select MM_OWNER + default n + ---help--- + Provides a Resource Controller which enables tracking the owner + of every Block I/O request. 
+ The information this subsystem provides can be used from any + kind of module such as dm-ioband device mapper modules or + the cfq-scheduler. + +config CGROUP_PAGE + def_bool y + depends on CGROUP_MEM_RES_CTLR || CGROUP_BLKIO + config MM_OWNER bool diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7a34cb5..9a03980 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -786,7 +786,8 @@ static void blk_add_trace_unplug_io(struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + unsigned int pdu = q->rq_data.count[READ] + + q->rq_data.count[WRITE]; __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, @@ -799,7 +800,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + unsigned int pdu = q->rq_data.count[READ] + + q->rq_data.count[WRITE]; __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd64..6208744 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,6 +39,8 @@ else obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o +obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o +obj-$(CONFIG_CGROUP_BLKIO) += biotrack.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c86edd2..60c91e4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -7,6 +7,7 @@ #include #include #include +#include "../block/elevator-fq.h" void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -283,16 +284,22 @@ static wait_queue_head_t congestion_wqh[2] = { 
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +void congestion_wake_up(int sync) +{ + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + if (waitqueue_active(wqh)) + wake_up(wqh); +} + void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; - wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; clear_bit(bit, &bdi->state); smp_mb__after_clear_bit(); - if (waitqueue_active(wqh)) - wake_up(wqh); + congestion_wake_up(sync); } EXPORT_SYMBOL(clear_bdi_congested); @@ -327,3 +334,64 @@ long congestion_wait(int sync, long timeout) } EXPORT_SYMBOL(congestion_wait); +/* + * With group IO scheduling, there are request descriptors per io group per + * queue. So generic notion of whether queue is congested or not is not + * very accurate. Queue might not be congested but the io group in which + * request will go might actually be congested. + * + * Hence to get the correct idea about congestion level, one should query + * the io group congestion status on the queue. Pass in the page information + * which can be used to determine the io group of the page and congestion + * status can be determined accordingly. + * + * If page info is not passed, io group is determined from the current task + * context. 
+ */ +#ifdef CONFIG_GROUP_IOSCHED +int bdi_congested_group(struct backing_dev_info *bdi, int bdi_bits, + struct page *page) +{ + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, bdi_bits, page, 1); + + return blk_queue_io_group_congested(bdi, bdi_bits, page); +} +EXPORT_SYMBOL(bdi_congested_group); + +int bdi_read_congested_group(struct backing_dev_info *bdi, struct page *page) +{ + return bdi_congested_group(bdi, 1 << BDI_sync_congested, page); +} +EXPORT_SYMBOL(bdi_read_congested_group); + +/* Checks if either bdi or associated group is read congested */ +int bdi_or_group_read_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_read_congested(bdi) || bdi_read_congested_group(bdi, page); +} +EXPORT_SYMBOL(bdi_or_group_read_congested); + +int bdi_write_congested_group(struct backing_dev_info *bdi, struct page *page) +{ + return bdi_congested_group(bdi, 1 << BDI_async_congested, page); +} +EXPORT_SYMBOL(bdi_write_congested_group); + +/* Checks if either bdi or associated group is write congested */ +int bdi_or_group_write_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_write_congested(bdi) || bdi_write_congested_group(bdi, page); +} +EXPORT_SYMBOL(bdi_or_group_write_congested); + +int bdi_rw_congested_group(struct backing_dev_info *bdi, struct page *page) +{ + return bdi_congested_group(bdi, (1 << BDI_sync_congested) | + (1 << BDI_async_congested), page); +} +EXPORT_SYMBOL(bdi_rw_congested_group); + +#endif /* CONFIG_GROUP_IOSCHED */ diff --git a/mm/biotrack.c b/mm/biotrack.c new file mode 100644 index 0000000..1da7d1e --- /dev/null +++ b/mm/biotrack.c @@ -0,0 +1,293 @@ +/* biotrack.c - Block I/O Tracking + * + * Copyright (C) VA Linux Systems Japan, 2008-2009 + * Developed by Hirokazu Takahashi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 
2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * The block I/O tracking mechanism is implemented on the cgroup memory
+ * controller framework. It helps to find the owner of an I/O request
+ * because every I/O request has a target page and the owner of the page
+ * can be easily determined on the framework.
+ */
+
+/* Return the blkio_cgroup that associates with a cgroup. */
+static inline struct blkio_cgroup *cgroup_blkio(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, blkio_cgroup_subsys_id),
+					struct blkio_cgroup, css);
+}
+
+/*
+ * Return the blkio_cgroup that associates with a process, or NULL when
+ * no process is given.
+ */
+static inline struct blkio_cgroup *blkio_cgroup_from_task(struct task_struct *p)
+{
+	/*
+	 * The only caller passes mm->owner, which is not guaranteed to be
+	 * set (the memory controller guards mem_cgroup_from_task() the same
+	 * way). Return NULL rather than dereferencing a NULL task so that
+	 * the caller's !biog check is effective.
+	 */
+	if (unlikely(!p))
+		return NULL;
+
+	return container_of(task_subsys_state(p, blkio_cgroup_subsys_id),
+					struct blkio_cgroup, css);
+}
+
+static struct io_context default_blkio_io_context;
+static struct blkio_cgroup default_blkio_cgroup = {
+	.io_context	= &default_blkio_io_context,
+};
+
+/**
+ * blkio_cgroup_set_owner() - set the owner ID of a page.
+ * @page:	the page we want to tag
+ * @mm:		the mm_struct of a page owner
+ *
+ * Make a given page have the blkio-cgroup ID of the owner of this page.
+ * The page is left with the default ID 0 when @mm is NULL or when no
+ * blkio_cgroup can be determined for the owner of @mm. Nothing is done
+ * when the blkio cgroup is disabled or the page has no page_cgroup.
+ */
+void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
+{
+	struct blkio_cgroup *biog;
+	struct page_cgroup *pc;
+
+	if (blkio_cgroup_disabled())
+		return;
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc))
+		return;
+
+	pc->blkio_cgroup_id = 0;	/* 0: default blkio_cgroup id */
+	if (!mm)
+		return;
+	/*
+	 * Locking "pc" isn't necessary here since the current process is
+	 * the only one that can access the members related to blkio_cgroup.
+	 */
+	rcu_read_lock();
+	biog = blkio_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!biog))
+		goto out;
+	/*
+	 * css_get(&biog->css) isn't called to increment the reference
+	 * count of this blkio_cgroup "biog" so pc->blkio_cgroup_id
+	 * might turn invalid even if this page is still active.
+	 * This approach is chosen to minimize the overhead.
+	 */
+	pc->blkio_cgroup_id = css_id(&biog->css);
+out:
+	rcu_read_unlock();
+}
+
+/**
+ * blkio_cgroup_reset_owner() - reset the owner ID of a page
+ * @page:	the page we want to tag
+ * @mm:		the mm_struct of a page owner
+ *
+ * Change the owner of a given page if necessary.
+ */
+void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm)
+{
+	/*
+	 * A little trick:
+	 * Just call blkio_cgroup_set_owner() for pages which are already
+	 * active since the blkio_cgroup_id member of page_cgroup can be
+	 * updated without any locks. This is because an integer type of
+	 * variable can be set a new value at once on modern cpus.
+	 */
+	blkio_cgroup_set_owner(page, mm);
+}
+
+/**
+ * blkio_cgroup_reset_owner_pagedirty() - reset the owner ID of a pagecache page
+ * @page:	the page we want to tag
+ * @mm:		the mm_struct of a page owner
+ *
+ * Change the owner of a given page if the page is in the pagecache.
+ * The owner is left untouched when the current task has PF_MEMALLOC set.
+ */
+void blkio_cgroup_reset_owner_pagedirty(struct page *page, struct mm_struct *mm)
+{
+	if (!page_is_file_cache(page))
+		return;
+	if (current->flags & PF_MEMALLOC)
+		return;
+
+	blkio_cgroup_reset_owner(page, mm);
+}
+
+/**
+ * blkio_cgroup_copy_owner() - copy the owner ID of a page into another page
+ * @npage:	the page where we want to copy the owner
+ * @opage:	the page from which we want to copy the ID
+ *
+ * Copy the owner ID of @opage into @npage.
+ */
+void blkio_cgroup_copy_owner(struct page *npage, struct page *opage)
+{
+	struct page_cgroup *dst, *src;
+
+	if (blkio_cgroup_disabled())
+		return;
+
+	/* Both pages need a page_cgroup; otherwise there is nothing to copy. */
+	dst = lookup_page_cgroup(npage);
+	if (unlikely(!dst))
+		return;
+	src = lookup_page_cgroup(opage);
+	if (unlikely(!src))
+		return;
+
+	/*
+	 * A plain, lockless store of the ID, for the same reason as given
+	 * in blkio_cgroup_reset_owner().
+	 */
+	dst->blkio_cgroup_id = src->blkio_cgroup_id;
+}
+
+/*
+ * Create a new blkio-cgroup. The cgroup without a parent is backed by the
+ * static default_blkio_cgroup; every other cgroup gets a freshly allocated
+ * blkio_cgroup with its own io_context. Returns ERR_PTR(-ENOMEM) when
+ * either allocation fails.
+ */
+static struct cgroup_subsys_state *
+blkio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct blkio_cgroup *new_biog;
+
+	if (!cgrp->parent) {
+		struct io_context *root_ioc = default_blkio_cgroup.io_context;
+
+		init_io_context(root_ioc);
+		/* Take an extra reference so that it is never released. */
+		atomic_long_inc(&root_ioc->refcount);
+		return &default_blkio_cgroup.css;
+	}
+
+	new_biog = kzalloc(sizeof(*new_biog), GFP_KERNEL);
+	if (!new_biog)
+		return ERR_PTR(-ENOMEM);
+
+	new_biog->io_context = alloc_io_context(GFP_KERNEL, -1);
+	if (!new_biog->io_context) {
+		kfree(new_biog);
+		return ERR_PTR(-ENOMEM);
+	}
+	return &new_biog->css;
+}
+
+/*
+ * Delete the blkio-cgroup: drop its io_context reference, release its
+ * css ID and free the structure itself.
+ */
+static void blkio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct blkio_cgroup *doomed = cgroup_blkio(cgrp);
+
+	put_io_context(doomed->io_context);
+	free_css_id(&blkio_cgroup_subsys, &doomed->css);
+	kfree(doomed);
+}
+
+/**
+ * get_blkio_cgroup_id() - determine the blkio-cgroup ID
+ * @bio: the &struct bio which describes the I/O
+ *
+ * Returns the blkio-cgroup ID of a given bio. A return value zero
+ * means that the page associated with the bio belongs to default_blkio_cgroup.
+ */
+unsigned long get_blkio_cgroup_id(struct bio *bio)
+{
+	struct page_cgroup *pc;
+	struct page *page;
+	unsigned long id = 0;
+
+	/*
+	 * A bio that carries no data has no bi_io_vec to take the first
+	 * page from; dereferencing bio_iovec_idx(bio, 0) would oops.
+	 * Such a bio is accounted to the default blkio_cgroup (ID 0).
+	 */
+	if (!bio_has_data(bio))
+		return id;
+
+	/* The owner is taken from the first page of the bio only. */
+	page = bio_iovec_idx(bio, 0)->bv_page;
+	pc = lookup_page_cgroup(page);
+	if (pc)
+		id = pc->blkio_cgroup_id;
+	return id;
+}
+
+/**
+ * get_blkio_cgroup_iocontext() - determine the blkio-cgroup iocontext
+ * @bio:	the &struct bio which describes the I/O
+ *
+ * Returns the iocontext of blkio-cgroup that issued a given bio. When the
+ * ID recorded for the bio does not resolve to a blkio cgroup, the iocontext
+ * of default_blkio_cgroup is returned instead. The refcount of the returned
+ * io_context is incremented before returning; presumably the caller is
+ * expected to drop it with put_io_context().
+ */
+struct io_context *get_blkio_cgroup_iocontext(struct bio *bio)
+{
+	struct cgroup_subsys_state *css;
+	struct blkio_cgroup *biog;
+	struct io_context *ioc;
+	unsigned long id;
+
+	id = get_blkio_cgroup_id(bio);
+	rcu_read_lock();
+	css = css_lookup(&blkio_cgroup_subsys, id);
+	if (css)
+		biog = container_of(css, struct blkio_cgroup, css);
+	else
+		biog = &default_blkio_cgroup;
+	ioc = biog->io_context;	/* default io_context for this cgroup */
+	atomic_long_inc(&ioc->refcount);
+	rcu_read_unlock();
+	return ioc;
+}
+
+/**
+ * get_cgroup_from_page() - determine the cgroup from a page.
+ * @page:	the page to be tracked
+ *
+ * Returns the cgroup of a given page. NULL is returned when the page has
+ * no page_cgroup or when the ID recorded for the page does not resolve to
+ * a blkio cgroup; such a page is to be treated as belonging to
+ * default_blkio_cgroup.
+ *
+ * Note:
+ * This function must be called under rcu_read_lock().
+ */
+struct cgroup *get_cgroup_from_page(struct page *page)
+{
+	struct page_cgroup *pc;
+	struct cgroup_subsys_state *css;
+
+	pc = lookup_page_cgroup(page);
+	if (!pc)
+		return NULL;
+
+	css = css_lookup(&blkio_cgroup_subsys, pc->blkio_cgroup_id);
+	if (!css)
+		return NULL;
+
+	return css->cgroup;
+}
+
+EXPORT_SYMBOL(get_blkio_cgroup_id);
+EXPORT_SYMBOL(get_blkio_cgroup_iocontext);
+EXPORT_SYMBOL(get_cgroup_from_page);
+
+/* Read the ID of the specified blkio cgroup.
+ * The value returned is css_id() of the css embedded in the cgroup's
+ * blkio_cgroup; it is what the "id" control file below reports.
+ */
+static u64 blkio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct blkio_cgroup *biog = cgroup_blkio(cgrp);
+
+	return (u64)css_id(&biog->css);
+}
+
+static struct cftype blkio_files[] = {	/* control files added by blkio_cgroup_populate() */
+	{
+		.name = "id",
+		.read_u64 = blkio_id_read,	/* read-only: no write handler is set */
+	},
+};
+
+static int blkio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, blkio_files,	/* registers every entry of blkio_files */
+					ARRAY_SIZE(blkio_files));
+}
+
+struct cgroup_subsys blkio_cgroup_subsys = {	/* subsystem descriptor wiring up the callbacks above */
+	.name		= "blkio",
+	.create		= blkio_cgroup_create,
+	.destroy	= blkio_cgroup_destroy,
+	.populate	= blkio_cgroup_populate,
+	.subsys_id	= blkio_cgroup_subsys_id,
+	.use_id		= 1,	/* NOTE(review): presumably what enables css_id() for this subsystem - confirm */
+};
diff --git a/mm/bounce.c b/mm/bounce.c index a2b76a5..7ad8d44 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -210,6 +211,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + blkio_cgroup_copy_owner(to->bv_page, page); if (rw == WRITE) { char *vto, *vfrom; diff --git a/mm/filemap.c b/mm/filemap.c index ccea3b6..01c47a1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include #include /* for BUG_ON(!in_atomic()) only */ #include +#include #include /* for page_is_file_cache() */ #include "internal.h" @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, gfp_mask & GFP_RECLAIM_MASK); if (error) goto out; + blkio_cgroup_set_owner(page, current->mm); error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd4529d..baf4be7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -129,6 +129,12 @@ struct mem_cgroup_lru_info { struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; }; +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc) +{ + pc->mem_cgroup = NULL; + INIT_LIST_HEAD(&pc->lru);
+} + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide diff --git a/mm/memory.c b/mm/memory.c index aede2ce..346f368 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -2116,6 +2117,7 @@ gotten: */ ptep_clear_flush_notify(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + blkio_cgroup_set_owner(new_page, mm); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { @@ -2581,6 +2583,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + blkio_cgroup_reset_owner(page, mm); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -2645,6 +2648,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto release; inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); + blkio_cgroup_set_owner(page, mm); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ @@ -2792,6 +2796,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (anon) { inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); + blkio_cgroup_set_owner(page, mm); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 81627eb..f924e05 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -984,6 +985,17 @@ retry: if (nr_pages == 0) break; + /* + * If the io group page will go into is congested, bail out. 
+ */ + if (wbc->nonblocking + && bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1247,6 +1259,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); + blkio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f22b4eb..29bf26c 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -9,14 +9,15 @@ #include #include #include +#include static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) { pc->flags = 0; - pc->mem_cgroup = NULL; pc->page = pfn_to_page(pfn); - INIT_LIST_HEAD(&pc->lru); + __init_mem_page_cgroup(pc); + __init_blkio_page_cgroup(pc); } static unsigned long total_usage; @@ -74,7 +75,7 @@ void __init page_cgroup_init_flatmem(void) int nid, fail; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() && blkio_cgroup_disabled()) return; for_each_online_node(nid) { @@ -83,12 +84,13 @@ void __init page_cgroup_init_flatmem(void) goto fail; } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" - " don't want memory cgroups\n"); + printk(KERN_INFO "please try 'cgroup_disable=memory,blkio' option" + " if you don't want memory and blkio cgroups\n"); return; fail: printk(KERN_CRIT "allocation of page_cgroup failed.\n"); - printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); + printk(KERN_CRIT + "please try 'cgroup_disable=memory,blkio' boot option\n"); panic("Out of memory"); } @@ -245,7 +247,7 @@ void __init page_cgroup_init(void) unsigned long pfn; int fail = 0; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() && 
blkio_cgroup_disabled()) return; for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { @@ -254,14 +256,15 @@ void __init page_cgroup_init(void) fail = init_section_page_cgroup(pfn); } if (fail) { - printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); + printk(KERN_CRIT + "try 'cgroup_disable=memory,blkio' boot option\n"); panic("Out of memory"); } else { hotplug_memory_notifier(page_cgroup_callback, 0); } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" - " want memory cgroups\n"); + printk(KERN_INFO "please try 'cgroup_disable=memory,blkio' option" + " if you don't want memory and blkio cgroups\n"); } void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa23..22e0639 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -542,7 +542,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_or_group_read_congested(mapping->backing_dev_info, NULL)) return; /* do read-ahead */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38e..6eb96f1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -307,6 +308,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ __set_page_locked(new_page); SetPageSwapBacked(new_page); + blkio_cgroup_set_owner(new_page, current->mm); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { /*