Documentation/block/00-INDEX | 2 + Documentation/block/io-controller.txt | 452 ++++ block/Kconfig.iosched | 80 + block/Makefile | 1 + block/as-iosched.c | 503 ++++- block/blk-core.c | 333 +++- block/blk-ioc.c | 33 +- block/blk-settings.c | 1 + block/blk-sysfs.c | 73 +- block/blk.h | 4 + block/cfq-iosched.c | 1205 ++++------- block/deadline-iosched.c | 129 +- block/elevator-fq.c | 4161 +++++++++++++++++++++++++++++++++ block/elevator-fq.h | 769 ++++++ block/elevator.c | 198 ++- block/noop-iosched.c | 73 +- drivers/md/dm-table.c | 11 +- drivers/md/dm.c | 7 +- drivers/md/dm.h | 3 +- drivers/md/linear.c | 7 +- drivers/md/multipath.c | 7 +- drivers/md/raid0.c | 6 +- drivers/md/raid1.c | 9 +- drivers/md/raid10.c | 6 +- drivers/md/raid5.c | 2 +- fs/afs/write.c | 8 +- fs/btrfs/disk-io.c | 6 +- fs/btrfs/extent_io.c | 12 + fs/btrfs/volumes.c | 8 +- fs/buffer.c | 2 + fs/cifs/file.c | 11 + fs/direct-io.c | 2 + fs/ext2/ialloc.c | 2 +- fs/gfs2/aops.c | 12 + fs/nilfs2/segbuf.c | 3 +- fs/xfs/linux-2.6/xfs_aops.c | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 2 +- include/linux/backing-dev.h | 61 +- include/linux/biotrack.h | 103 + include/linux/blkdev.h | 106 +- include/linux/cgroup_subsys.h | 12 + include/linux/elevator.h | 92 +- include/linux/iocontext.h | 6 + include/linux/memcontrol.h | 6 + include/linux/mmzone.h | 4 +- include/linux/page_cgroup.h | 31 +- init/Kconfig | 23 + mm/Makefile | 4 +- mm/backing-dev.c | 62 + mm/biotrack.c | 321 +++ mm/bounce.c | 2 + mm/filemap.c | 2 + mm/memcontrol.c | 6 + mm/memory.c | 5 + mm/page-writeback.c | 13 + mm/page_cgroup.c | 17 +- mm/readahead.c | 2 +- mm/swap_state.c | 2 + 58 files changed, 7877 insertions(+), 1148 deletions(-) diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index 961a051..dc8bf95 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -10,6 +10,8 @@ capability.txt - Generic Block Device Capability (/sys/block//capability) deadline-iosched.txt - Deadline IO scheduler tunables 
+io-controller.txt + - IO controller for providing hierarchical IO scheduling ioprio.txt - Block io priorities (in CFQ scheduler) request.txt diff --git a/Documentation/block/io-controller.txt b/Documentation/block/io-controller.txt new file mode 100644 index 0000000..82804ad --- /dev/null +++ b/Documentation/block/io-controller.txt @@ -0,0 +1,452 @@ + IO Controller + ============= + +Overview +======== + +This patchset implements a proportional weight IO controller. That is one +can create cgroups and assign prio/weights to those cgroups and task group +will get access to disk proportionate to the weight of the group. + +These patches modify elevator layer and individual IO schedulers to do +IO control hence this io controller works only on block devices which use +one of the standard io schedulers and can not be used with any xyz logical block +device. + +The assumption/thought behind modifying IO scheduler is that resource control +is primarily needed on leaf nodes where the actual contention for resources is +present and not on intermediate logical block devices. + +Consider following hypothetical scenario. Let's say there are three physical +disks, namely sda, sdb and sdc. Two logical volumes (lv0 and lv1) have been +created on top of these. Some part of sdb is in lv0 and some part is in lv1. + + lv0 lv1 + / \ / \ + sda sdb sdc + +Also consider following cgroup hierarchy + + root + / \ + A B + / \ / \ + T1 T2 T3 T4 + +A and B are two cgroups and T1, T2, T3 and T4 are tasks with-in those cgroups. +Assuming T1, T2, T3 and T4 are doing IO on lv0 and lv1. These tasks should +get their fair share of bandwidth on disks sda, sdb and sdc. There is no +IO control on intermediate logical block nodes (lv0, lv1). + +So if tasks T1 and T2 are doing IO on lv0 and T3 and T4 are doing IO on lv1 +only, there will not be any contention for resources between group A and B if +IO is going to sda or sdc.
But if actual IO gets translated to disk sdb, then +IO scheduler associated with the sdb will distribute disk bandwidth to +group A and B proportionate to their weight. + +CFQ already has the notion of fairness and it provides differential disk +access based on priority and class of the task. Just that it is flat and +with cgroup stuff, it needs to be made hierarchical to achieve a good +hierarchical control on IO. + +Rest of the IO schedulers (noop, deadline and AS) don't have any notion +of fairness among various threads. They maintain only one queue where all +the IO gets queued (internally this queue is split in read and write queue +for deadline and AS). With this patchset, now we maintain one queue per +cgroup per device and then try to do fair queuing among those queues. + +One of the concerns raised with modifying IO schedulers was that we don't +want to replicate the code in all the IO schedulers. These patches share +the fair queuing code which has been moved to a common layer (elevator +layer). Hence we don't end up replicating code across IO schedulers. Following +diagram depicts the concept. + + -------------------------------- + | Elevator Layer + Fair Queuing | + -------------------------------- + | | | | + NOOP DEADLINE AS CFQ + +Design +====== +This patchset primarily uses BFQ (Budget Fair Queuing) code to provide +fairness among different IO queues. Fabio and Paolo implemented BFQ which uses +B-WF2Q+ algorithm for fair queuing. + +Why BFQ? + +- Not sure if weighted round robin logic of CFQ can be easily extended for + hierarchical mode. One of the things is that we can not keep dividing + the time slice of parent group among children. Deeper we go in hierarchy + time slice will get smaller. + + One of the ways to implement hierarchical support could be to keep track + of virtual time and service provided to queue/group and select a queue/group + for service based on any of the various available algorithms.
+ + BFQ already had support for hierarchical scheduling, taking those patches + was easier. + +- BFQ was designed to provide tighter bounds/delay w.r.t service provided + to a queue. Delay/Jitter with BFQ is O(1). + + Note: BFQ originally used amount of IO done (number of sectors) as notion + of service provided. IOW, it tried to provide fairness in terms of + actual IO done and not in terms of actual time disk access was + given to a queue. + + This patchset modified BFQ to provide fairness in time domain because + that's what CFQ does. So idea was try not to deviate too much from + the CFQ behavior initially. + + Providing fairness in time domain makes accounting tricky because + due to command queueing, at one time there might be multiple requests + from different queues and there is no easy way to find out how much + disk time actually was consumed by the requests of a particular + queue. More about this in comments in source code. + +We have taken BFQ code as starting point for providing fairness among groups +because it already contained lots of features which we required to implement +hierarchical IO scheduling. With this patch set, I am not trying to ensure O(1) +delay here as my goal is to provide fairness among groups. Most likely that +will mean that latencies are not worse than what cfq currently provides (if +not improved ones). Once fairness is ensured, one can look into more in +ensuring O(1) latencies. + +From data structure point of view, one can think of a tree per device, where +io groups and io queues are hanging and are being scheduled using B-WF2Q+ +algorithm. io_queue, is end queue where requests are actually stored and +dispatched from (like cfqq). + +These io queues are primarily created by and managed by end io schedulers +depending on its semantics. For example, noop, deadline and AS ioschedulers +keep one io queue per cgroup and cfqq keeps one io queue per io_context in +a cgroup (apart from async queues).
+ +A request is mapped to an io group by elevator layer and which io queue it +is mapped to within group depends on ioscheduler. Currently "current" task +is used to determine the cgroup (hence io group) of the request. Down the +line we need to make use of bio-cgroup patches to map delayed writes to +right group. + +Going back to old behavior +========================== +In new scheme of things essentially we are creating hierarchical fair +queuing logic in elevator layer and changing IO schedulers to make use of +that logic so that end IO schedulers start supporting hierarchical scheduling. + +Elevator layer continues to support the old interfaces. So even if fair queuing +is enabled at elevator layer, one can have both new hierarchical scheduler as +well as old non-hierarchical scheduler operating. + +Also noop, deadline and AS have option of enabling hierarchical scheduling. +If it is selected, fair queuing is done in hierarchical manner. If hierarchical +scheduling is disabled, noop, deadline and AS should retain their existing +behavior. + +CFQ is the only exception where one can not disable fair queuing as it is +needed for providing fairness among various threads even in non-hierarchical +mode. + +Various user visible config options +=================================== +CONFIG_IOSCHED_NOOP_HIER + - Enables hierarchical fair queuing in noop. Not selecting this option + leads to old behavior of noop. + +CONFIG_IOSCHED_DEADLINE_HIER + - Enables hierarchical fair queuing in deadline. Not selecting this + option leads to old behavior of deadline. + +CONFIG_IOSCHED_AS_HIER + - Enables hierarchical fair queuing in AS. Not selecting this option + leads to old behavior of AS. + +CONFIG_IOSCHED_CFQ_HIER + - Enables hierarchical fair queuing in CFQ. Not selecting this option + still does fair queuing among various queues but it is flat and not + hierarchical. + +CGROUP_BLKIO + - This option enables blkio-cgroup controller for IO tracking + purposes.
That means, by this controller one can attribute a write + to the original cgroup and not assume that it belongs to submitting + thread. + +CONFIG_TRACK_ASYNC_CONTEXT + - Currently CFQ attributes the writes to the submitting thread and + caches the async queue pointer in the io context of the process. + If this option is set, it tells cfq and elevator fair queuing logic + that for async writes make use of IO tracking patches and attribute + writes to original cgroup and not to write submitting thread. + + This should be primarily useful when lots of asynchronous writes + are being submitted by pdflush threads and we need to assign the + writes to right group. + +CONFIG_DEBUG_GROUP_IOSCHED + - Throws extra debug messages in blktrace output helpful in + doing debugging in hierarchical setup. + + - Also allows for export of extra debug statistics like group queue + and dequeue statistics on device through cgroup interface. + +Config options selected automatically +===================================== +These config options are not user visible and are selected/deselected +automatically based on IO scheduler configurations. + +CONFIG_ELV_FAIR_QUEUING + - Enables/Disables the fair queuing logic at elevator layer. + +CONFIG_GROUP_IOSCHED + - Enables/Disables hierarchical queuing and associated cgroup bits. + +HOWTO +===== +You can do a very simple testing of running two dd threads in two different +cgroups. Here is what you can do. + +- Enable hierarchical scheduling in io scheduler of your choice (say cfq). + CONFIG_IOSCHED_CFQ_HIER=y + +- Enable IO tracking for async writes. + CONFIG_TRACK_ASYNC_CONTEXT=y + + (This will automatically select CGROUP_BLKIO) + +- Compile and boot into kernel and mount IO controller and blkio io tracking + controller.
+ + mount -t cgroup -o io,blkio none /cgroup + +- Create two cgroups + mkdir -p /cgroup/test1/ /cgroup/test2 + +- Set weights of group test1 and test2 + echo 1000 > /cgroup/test1/io.weight + echo 500 > /cgroup/test2/io.weight + +- Set "fairness" parameter to 1 at the disk you are testing. + + echo 1 > /sys/block/<disk>/queue/iosched/fairness + +- Create two same size files (say 512MB each) on same disk (file1, file2) and + launch two dd threads in different cgroup to read those files. Make sure + right io scheduler is being used for the block device where files are + present (the one you compiled in hierarchical mode). + + sync + echo 3 > /proc/sys/vm/drop_caches + + dd if=/mnt/sdb/zerofile1 of=/dev/null & + echo $! > /cgroup/test1/tasks + cat /cgroup/test1/tasks + + dd if=/mnt/sdb/zerofile2 of=/dev/null & + echo $! > /cgroup/test2/tasks + cat /cgroup/test2/tasks + +- At macro level, first dd should finish first. To get more precise data, keep + on looking at (with the help of script), at io.disk_time and io.disk_sectors + files of both test1 and test2 groups. This will tell how much disk time + (in milliseconds), each group got and how many sectors each group + dispatched to the disk. We provide fairness in terms of disk time, so + ideally io.disk_time of cgroups should be in proportion to the weight. + +Some High Level Test setups +=========================== +One of the use cases of IO controller is to provide some kind of IO isolation +between multiple virtual machines on the same host. Following is one +example setup which worked for me. + + + KVM KVM + Guest1 Guest2 + --------- ---------- + | ----- | | ------ | + | | vdb | | | | vdb | | + | ----- | | ------ | + --------- ---------- + + --------------------------- + | Host | + | ------------- | + | | sdb1 | sdb2 | | + | ------------- | + --------------------------- + +On host machine, I had a spare SATA disk. I created two partitions sdb1 +and sdb2 and gave these partitions as additional storage to kvm guests.
sdb1 +to KVM guest1 and sdb2 KVM guest2. These storage appeared as /dev/vdb in +both the guests. Formatted the /dev/vdb and created ext3 file system and +started a 1G file writeout in both the guests. Before writeout I had created +two cgroups of weight 1000 and 500 and put virtual machines in two different +groups. + +Following is write I started in both the guests. + +dd if=/dev/zero of=/mnt/vdb/zerofile1 bs=4K count=262144 conv=fdatasync + +Following are the results on host with "deadline" scheduler. + +group1 time=8:16 17755 group1 sectors=8:16 2104608 +group2 time=8:16 9649 group2 sectors=8:16 1180480 + +Virtual machine with cgroup weight 1000 got almost double the time of virtual +machine with weight 500. + +What Works and What Does not +============================ +Service differentiation at application level can be noticed only if completely +parallel IO paths are created from application to IO scheduler and there +are no serializations introduced by any intermediate layer. For example, +in some cases file system and page cache layer introduce serialization and +we don't see service difference between higher weight and lower weight +process groups. + +For example, when I start an O_SYNC write out on an ext3 file system (file +is being created newly), I see lots of activity from kjournald. I have not +gone into details yet, but my understanding is that there are lot more +journal commits and kjournald kind of introduces serialization between two +processes. So even if you put these two processes in two different cgroups +with different weights, higher weight process will not see more IO done. + +It does work very well when we bypass filesystem layer and IO is raw. For +example in above virtual machine case, host sees raw synchronous writes +coming from two guest machines and filesystem layer at host is not introducing +any kind of serialization hence we can see the service difference. 
+ +It also works very well for reads even on the same file system as for reads +file system journalling activity does not kick in and we can create parallel +IO paths from application to all the way down to IO scheduler and get more +IO done on the IO path with higher weight. + +Regarding "fairness" parameter +============================== +IO controller has introduced a "fairness" tunable for every io scheduler. +Currently this tunable can assume values 0, 1. + +If fairness is set to 1, then IO controller waits for requests to finish from +previous queue before requests from new queue are dispatched. This helps in +doing better accounting of disk time consumed by a queue. If this is not done +then on a queuing hardware, there can be requests from multiple queues and +we will not have any idea which queue consumed how much of disk time. + +Details of cgroup files +======================= +- io.ioprio_class + - Specifies class of the cgroup (RT, BE, IDLE). This is default io + class of the group on all the devices until and unless overridden by + per device rule. (See io.policy). + + 1 = RT, 2 = BE, 3 = IDLE + +- io.weight + - Specifies per cgroup weight. This is default weight of the group + on all the devices until and unless overridden by per device rule. + (See io.policy). + + Currently allowed range of weights is from 1 to 1000. + +- io.disk_time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- io.disk_sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- io.disk_queue + - Debugging aid only enabled if CONFIG_DEBUG_GROUP_IOSCHED=y.
This + gives the statistics about how many a times a group was queued + on service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was queued on a particular device. + +- io.disk_dequeue + - Debugging aid only enabled if CONFIG_DEBUG_GROUP_IOSCHED=y. This + gives the statistics about how many a times a group was de-queued + or removed from the service tree of the device. This basically gives + an idea if we can generate enough IO to create continuously + backlogged groups. First two fields specify the major and minor + number of the device and third field specifies the number + of times a group was de-queued on a particular device. + +- io.policy + - One can specify per cgroup per device rules using this interface. + These rules override the default value of group weight and class as + specified by io.weight and io.ioprio_class. + + Following is the format. + + #echo dev_maj:dev_minor weight ioprio_class > /path/to/cgroup/io.policy + + weight=0 means removing a policy. + + Examples: + + Configure weight=300 ioprio_class=2 on /dev/hdb (8:16) in this cgroup + # echo 8:16 300 2 > io.policy + # cat io.policy + dev weight class + 8:16 300 2 + + Configure weight=500 ioprio_class=1 on /dev/hda (8:0) in this cgroup + # echo 8:0 500 1 > io.policy + # cat io.policy + dev weight class + 8:0 500 1 + 8:16 300 2 + + Remove the policy for /dev/hda in this cgroup + # echo 8:0 0 1 > io.policy + # cat io.policy + dev weight class + 8:16 300 2 + +About configuring request descriptors +===================================== +Traditionally there are 128 request descriptors allocated per request queue +where io scheduler is operating (/sys/block/<disk>/queue/nr_requests). If these +request descriptors are exhausted, processes will be put to sleep and woken +up once request descriptors are available.
+ +With io controller and cgroup stuff, one can not afford to allocate requests +from single pool as one group might allocate lots of requests and then tasks +from other groups might be put to sleep and this other group might be a +higher weight group. Hence to make sure that a group always can get the +request descriptors it is entitled to, one needs to make request descriptor +limit per group on every queue. + +A new parameter /sys/block/<disk>/queue/nr_group_requests has been introduced +and this parameter controls the maximum number of requests per group. +nr_requests still continues to control total number of request descriptors +on the queue. + +Ideally one should set nr_requests to be following. + +nr_requests = number_of_cgroups * nr_group_requests + +This will make sure that at any point of time nr_group_requests number of +request descriptors will be available for any of the cgroups. + +Currently default nr_requests=512 and nr_group_requests=128. This will make +sure that apart from root group one can create 3 more groups without running +into any issues. If one decides to create more cgroups, nr_requests and +nr_group_requests should be adjusted accordingly. + +Issues +====== +- How to do more accurate disk time accounting, especially with CFQ. We + don't start disk time accounting till first request from the queue has + completed. But there are cases like above virtual machine setup, where + closely cooperating threads issue 1-2 request per thread/io context and + expire. That means many a times, queue does not get serviced for a long + period and gets expired immediately after dispatching one request. First + request is free so we are left with queue disk time as zero and that's + problematic.
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 7e803fc..8b507c4 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -2,6 +2,19 @@ if BLOCK menu "IO Schedulers" +config ELV_FAIR_QUEUING + bool + default n + ---help--- + Traditionally only cfq had notion of multiple queues and it did + fair queuing on its own. With the cgroups and need of controlling + IO, now even the simple io schedulers like noop, deadline, as will + have one queue per cgroup and will need hierarchical fair queuing. + Instead of every io scheduler implementing its own fair queuing + logic, this option enables fair queuing in elevator layer so that + other ioschedulers can make use of it. + If unsure, say N. + config IOSCHED_NOOP bool default y @@ -12,6 +25,17 @@ config IOSCHED_NOOP that do their own scheduling and require only minimal assistance from the kernel. +config IOSCHED_NOOP_HIER + bool "Noop Hierarchical Scheduling support" + depends on IOSCHED_NOOP && CGROUPS + select ELV_FAIR_QUEUING + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in noop. In this mode noop keeps + one IO queue per cgroup instead of a global queue. Elevator + fair queuing logic ensures fairness among various queues. + config IOSCHED_AS tristate "Anticipatory I/O scheduler" default y @@ -21,6 +45,18 @@ config IOSCHED_AS deadline I/O scheduler, it can also be slower in some cases especially some database loads. +config IOSCHED_AS_HIER + bool "Anticipatory Hierarchical Scheduling support" + depends on IOSCHED_AS && CGROUPS + select ELV_FAIR_QUEUING + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in anticipatory. In this mode + anticipatory keeps one IO queue per cgroup instead of a global + queue. Elevator fair queuing logic ensures fairness among various + queues.
+ config IOSCHED_DEADLINE tristate "Deadline I/O scheduler" default y @@ -31,8 +67,20 @@ config IOSCHED_DEADLINE a disk at any one time, its behaviour is almost identical to the anticipatory I/O scheduler and so is a good choice. +config IOSCHED_DEADLINE_HIER + bool "Deadline Hierarchical Scheduling support" + depends on IOSCHED_DEADLINE && CGROUPS + select ELV_FAIR_QUEUING + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in deadline. In this mode deadline keeps + one IO queue per cgroup instead of a global queue. Elevator + fair queuing logic ensures fairness among various queues. + config IOSCHED_CFQ tristate "CFQ I/O scheduler" + select ELV_FAIR_QUEUING default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -40,6 +88,14 @@ config IOSCHED_CFQ working environment, suitable for desktop systems. This is the default I/O scheduler. +config IOSCHED_CFQ_HIER + bool "CFQ Hierarchical Scheduling support" + depends on IOSCHED_CFQ && CGROUPS + select GROUP_IOSCHED + default n + ---help--- + Enable hierarchical scheduling in cfq. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ @@ -68,6 +124,30 @@ config DEFAULT_IOSCHED default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP +config DEBUG_GROUP_IOSCHED + bool "Debug Hierarchical Scheduling support" + depends on CGROUPS && GROUP_IOSCHED + default n + ---help--- + Enable some debugging hooks for hierarchical scheduling support. + Currently it just outputs more information in blktrace output. + +config TRACK_ASYNC_CONTEXT + bool "Determine async request context from bio" + depends on GROUP_IOSCHED + select CGROUP_BLKIO + default n + ---help--- + Normally async request is attributed to the task submitting the + request. With group ioscheduling, for accurate accounting of + async writes, one needs to map the request to original task/cgroup + which originated the request and not the submitter of the request.
+ + Currently there are generic io tracking patches to provide facility + to map bio to original owner. If this option is set, for async + request, original owner of the bio is decided by using io tracking + patches otherwise we continue to attribute the request to the + submitting thread. endmenu endif diff --git a/block/Makefile b/block/Makefile index 6c54ed0..d545323 100644 --- a/block/Makefile +++ b/block/Makefile @@ -15,3 +15,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_ELV_FAIR_QUEUING) += elevator-fq.o diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6..d412e36 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * See Documentation/block/as-iosched.txt @@ -76,13 +77,8 @@ enum anticipation_status { * or timed out */ }; -struct as_data { - /* - * run time data - */ - - struct request_queue *q; /* the "owner" queue */ - +struct as_queue { + struct io_queue *ioq; /* * requests (as_rq s) are present on both sort_list and fifo_list */ @@ -90,6 +86,28 @@ struct as_data { struct list_head fifo_list[2]; struct request *next_rq[2]; /* next in sort order */ + + /* + * If an as_queue is switched while a batch is running, then we + * store the time left before current batch will expire + */ + long current_batch_time_left; + + /* + * batch data dir when queue was scheduled out. This will be used + * to setup ad->batch_data_dir when queue is scheduled in. + */ + int saved_batch_data_dir; + + unsigned long last_check_fifo[2]; + int write_batch_count; /* max # of reqs in a write batch */ + int current_write_count; /* how many requests left this batch */ + int write_batch_idled; /* has the write batch gone idle? 
*/ + int nr_queued[2]; +}; + +struct as_data { + struct request_queue *q; /* the "owner" queue */ sector_t last_sector[2]; /* last SYNC & ASYNC sectors */ unsigned long exit_prob; /* probability a task will exit while @@ -103,21 +121,17 @@ struct as_data { sector_t new_seek_mean; unsigned long current_batch_expires; - unsigned long last_check_fifo[2]; int changed_batch; /* 1: waiting for old batch to end */ int new_batch; /* 1: waiting on first read complete */ - int batch_data_dir; /* current batch SYNC / ASYNC */ - int write_batch_count; /* max # of reqs in a write batch */ - int current_write_count; /* how many requests left this batch */ - int write_batch_idled; /* has the write batch gone idle? */ enum anticipation_status antic_status; unsigned long antic_start; /* jiffies: when it started */ struct timer_list antic_timer; /* anticipatory scheduling timer */ - struct work_struct antic_work; /* Deferred unplugging */ + struct work_struct antic_work; /* Deferred unplugging */ struct io_context *io_context; /* Identify the expected process */ int ioc_finished; /* IO associated with io_context is finished */ int nr_dispatched; + int batch_data_dir; /* current batch SYNC / ASYNC */ /* * settings that change how the i/o scheduler behaves @@ -125,6 +139,9 @@ struct as_data { unsigned long fifo_expire[2]; unsigned long batch_expire[2]; unsigned long antic_expire; + + /* elevator requested a queue switch. */ + int switch_queue; }; /* @@ -146,12 +163,185 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +#define as_log_asq(ad, asq, fmt, args...) \ +{ \ + blk_add_trace_msg((ad)->q, "as %s " fmt, \ + ioq_to_io_group((asq)->ioq)->path, ##args); \ +} +#else +#define as_log_asq(ad, asq, fmt, args...) \ + blk_add_trace_msg((ad)->q, "as " fmt, ##args) +#endif + +#define as_log(ad, fmt, args...) 
\ + blk_add_trace_msg((ad)->q, "as " fmt, ##args) + static DEFINE_PER_CPU(unsigned long, ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); static void as_move_to_dispatch(struct as_data *ad, struct request *rq); static void as_antic_stop(struct as_data *ad); +static inline int as_batch_expired(struct as_data *ad, struct as_queue *asq); + +#ifdef CONFIG_IOSCHED_AS_HIER +static void as_save_batch_context(struct as_data *ad, struct as_queue *asq) +{ + /* Save batch data dir */ + asq->saved_batch_data_dir = ad->batch_data_dir; + + if (ad->changed_batch) { + /* + * In case of force expire, we come here. Batch changeover + * has been signalled but we are waiting for all the + * request to finish from previous batch and then start + * the new batch. Can't wait now. Mark that full batch time + * needs to be allocated when this queue is scheduled again. + */ + asq->current_batch_time_left = + ad->batch_expire[ad->batch_data_dir]; + ad->changed_batch = 0; + goto out; + } + + if (ad->new_batch) { + /* + * We should come here only when new_batch has been set + * but no read request has been issued or if it is a forced + * expiry. + * + * In both the cases, new batch has not started yet so + * allocate full batch length for next scheduling opportunity. + * We don't do write batch size adjustment in hierarchical + * AS so that should not be an issue. 
+ */ + asq->current_batch_time_left = + ad->batch_expire[ad->batch_data_dir]; + ad->new_batch = 0; + goto out; + } + + /* Save how much time is left before current batch expires */ + if (as_batch_expired(ad, asq)) + asq->current_batch_time_left = 0; + else { + asq->current_batch_time_left = ad->current_batch_expires + - jiffies; + BUG_ON((asq->current_batch_time_left) < 0); + } + + if (ad->io_context) { + put_io_context(ad->io_context); + ad->io_context = NULL; + } + +out: + as_log_asq(ad, asq, "save batch: dir=%c time_left=%d changed_batch=%d" + " new_batch=%d, antic_status=%d", + ad->batch_data_dir ? 'R' : 'W', + asq->current_batch_time_left, + ad->changed_batch, ad->new_batch, ad->antic_status); + return; +} + +/* + * FIXME: In original AS, read batch's time account started only after when + * first request had completed (if last batch was a write batch). But here + * we might be rescheduling a read batch right away irrespective of the fact + * of disk cache state. + */ +static void as_restore_batch_context(struct as_data *ad, struct as_queue *asq) +{ + /* Adjust the batch expire time */ + if (asq->current_batch_time_left) + ad->current_batch_expires = jiffies + + asq->current_batch_time_left; + /* restore asq batch_data_dir info */ + ad->batch_data_dir = asq->saved_batch_data_dir; + as_log_asq(ad, asq, "restore batch: dir=%c time=%d reads_q=%d" + " writes_q=%d ad->antic_status=%d", + ad->batch_data_dir ? 'R' : 'W', + asq->current_batch_time_left, + asq->nr_queued[1], asq->nr_queued[0], + ad->antic_status); +} + +/* ioq has been set. */ +static void as_active_ioq_set(struct request_queue *q, void *sched_queue, + int coop) +{ + struct as_queue *asq = sched_queue; + struct as_data *ad = q->elevator->elevator_data; + + as_restore_batch_context(ad, asq); +} + +/* + * This is a notification from common layer that it wishes to expire this + * io queue. AS decides whether queue can be expired, if yes, it also + * saves the batch context. 
+ */ +static int as_expire_ioq(struct request_queue *q, void *sched_queue, + int slice_expired, int force) +{ + struct as_data *ad = q->elevator->elevator_data; + int status = ad->antic_status; + struct as_queue *asq = sched_queue; + + as_log_asq(ad, asq, "as_expire_ioq slice_expired=%d, force=%d", + slice_expired, force); + + /* Forced expiry. We don't have a choice */ + if (force) { + as_antic_stop(ad); + /* + * antic_stop() sets antic_status to FINISHED which signifies + * that either we timed out or we found a close request but + * that's not the case here. Start from scratch. + */ + ad->antic_status = ANTIC_OFF; + as_save_batch_context(ad, asq); + ad->switch_queue = 0; + return 1; + } + + /* + * We are waiting for requests to finish from last + * batch. Don't expire the queue now + */ + if (ad->changed_batch) + goto keep_queue; + + /* + * Wait for all requests from existing batch to finish before we + * switch the queue. New queue might change the batch direction + * and this is to be consistent with AS philosophy of not dispatching + * new requests to underlying drive till requests from requests + * from previous batch are completed. + */ + if (ad->nr_dispatched) + goto keep_queue; + + /* + * If AS anticipation is ON, wait for it to finish. + */ + BUG_ON(status == ANTIC_WAIT_REQ); + + if (status == ANTIC_WAIT_NEXT) + goto keep_queue; + + /* We are good to expire the queue. 
Save batch context */ + as_save_batch_context(ad, asq); + ad->switch_queue = 0; + return 1; + +keep_queue: + /* Mark that elevator requested for queue switch whenever possible */ + ad->switch_queue = 1; + return 0; +} +#endif /* * IO Context helper functions @@ -258,13 +448,14 @@ static void as_put_io_context(struct request *rq) /* * rb tree support functions */ -#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) +#define RQ_RB_ROOT(asq, rq) (&(asq)->sort_list[rq_is_sync((rq))]) static void as_add_rq_rb(struct as_data *ad, struct request *rq) { struct request *alias; + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); - while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { + while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(asq, rq), rq)))) { as_move_to_dispatch(ad, alias); as_antic_stop(ad); } @@ -272,7 +463,9 @@ static void as_add_rq_rb(struct as_data *ad, struct request *rq) static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) { - elv_rb_del(RQ_RB_ROOT(ad, rq), rq); + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); + + elv_rb_del(RQ_RB_ROOT(asq, rq), rq); } /* @@ -366,7 +559,7 @@ as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) * what request to process next. Anticipation works on top of this. 
*/ static struct request * -as_find_next_rq(struct as_data *ad, struct request *last) +as_find_next_rq(struct as_data *ad, struct as_queue *asq, struct request *last) { struct rb_node *rbnext = rb_next(&last->rb_node); struct rb_node *rbprev = rb_prev(&last->rb_node); @@ -382,7 +575,7 @@ as_find_next_rq(struct as_data *ad, struct request *last) else { const int data_dir = rq_is_sync(last); - rbnext = rb_first(&ad->sort_list[data_dir]); + rbnext = rb_first(&asq->sort_list[data_dir]); if (rbnext && rbnext != &last->rb_node) next = rb_entry_rq(rbnext); } @@ -428,6 +621,7 @@ static void as_antic_waitnext(struct as_data *ad) mod_timer(&ad->antic_timer, timeout); ad->antic_status = ANTIC_WAIT_NEXT; + as_log(ad, "antic_waitnext set"); } /* @@ -441,8 +635,10 @@ static void as_antic_waitreq(struct as_data *ad) if (ad->antic_status == ANTIC_OFF) { if (!ad->io_context || ad->ioc_finished) as_antic_waitnext(ad); - else + else { ad->antic_status = ANTIC_WAIT_REQ; + as_log(ad, "antic_waitreq set"); + } } } @@ -454,6 +650,8 @@ static void as_antic_stop(struct as_data *ad) { int status = ad->antic_status; + as_log(ad, "as_antic_stop antic_status=%d", ad->antic_status); + if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) { if (status == ANTIC_WAIT_NEXT) del_timer(&ad->antic_timer); @@ -473,6 +671,7 @@ static void as_antic_timeout(unsigned long data) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); + as_log(ad, "as_antic_timeout"); if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { struct as_io_context *aic; @@ -651,6 +850,21 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq) struct io_context *ioc; struct as_io_context *aic; +#ifdef CONFIG_IOSCHED_AS_HIER + /* + * If the active asq and rq's asq are not same, then one can not + * break the anticipation. This primarily becomes useful when a + * request is added to a queue which is not being served currently. 
+ */ + if (rq) { + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); + struct as_queue *curr_asq = + elv_active_sched_queue(ad->q->elevator); + + if (asq != curr_asq) + return 0; + } +#endif ioc = ad->io_context; BUG_ON(!ioc); spin_lock(&ioc->lock); @@ -789,9 +1003,10 @@ static int as_can_anticipate(struct as_data *ad, struct request *rq) static void as_update_rq(struct as_data *ad, struct request *rq) { const int data_dir = rq_is_sync(rq); + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); /* keep the next_rq cache up to date */ - ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]); + asq->next_rq[data_dir] = as_choose_req(ad, rq, asq->next_rq[data_dir]); /* * have we been anticipating this request? @@ -808,29 +1023,37 @@ static void as_update_rq(struct as_data *ad, struct request *rq) /* * Gathers timings and resizes the write batch automatically */ -static void update_write_batch(struct as_data *ad) +static void update_write_batch(struct as_data *ad, struct request *rq) { unsigned long batch = ad->batch_expire[BLK_RW_ASYNC]; long write_time; + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); write_time = (jiffies - ad->current_batch_expires) + batch; if (write_time < 0) write_time = 0; - if (write_time > batch && !ad->write_batch_idled) { + as_log_asq(ad, asq, "upd write: write_time=%d batch=%d" + " write_batch_idled=%d current_write_count=%d", + write_time, batch, asq->write_batch_idled, + asq->current_write_count); + + if (write_time > batch && !asq->write_batch_idled) { if (write_time > batch * 3) - ad->write_batch_count /= 2; + asq->write_batch_count /= 2; else - ad->write_batch_count--; - } else if (write_time < batch && ad->current_write_count == 0) { + asq->write_batch_count--; + } else if (write_time < batch && asq->current_write_count == 0) { if (batch > write_time * 3) - ad->write_batch_count *= 2; + asq->write_batch_count *= 2; else - ad->write_batch_count++; + asq->write_batch_count++; } - if (ad->write_batch_count < 
1) - ad->write_batch_count = 1; + if (asq->write_batch_count < 1) + asq->write_batch_count = 1; + + as_log_asq(ad, asq, "upd write count=%d", asq->write_batch_count); } /* @@ -840,6 +1063,7 @@ static void update_write_batch(struct as_data *ad) static void as_completed_request(struct request_queue *q, struct request *rq) { struct as_data *ad = q->elevator->elevator_data; + struct as_queue *asq = elv_get_sched_queue(q, rq); WARN_ON(!list_empty(&rq->queuelist)); @@ -848,7 +1072,24 @@ static void as_completed_request(struct request_queue *q, struct request *rq) goto out; } + as_log_asq(ad, asq, "complete: reads_q=%d writes_q=%d changed_batch=%d" + " new_batch=%d switch_queue=%d, dir=%c", + asq->nr_queued[1], asq->nr_queued[0], ad->changed_batch, + ad->new_batch, ad->switch_queue, + ad->batch_data_dir ? 'R' : 'W'); + if (ad->changed_batch && ad->nr_dispatched == 1) { + /* + * If this was write batch finishing, adjust the write batch + * length. + * + * Note, write batch length is being calculated upon completion + * of last write request finished and not completion of first + * read request finished in the next batch. + */ + if (ad->batch_data_dir == BLK_RW_SYNC) + update_write_batch(ad, rq); + ad->current_batch_expires = jiffies + ad->batch_expire[ad->batch_data_dir]; kblockd_schedule_work(q, &ad->antic_work); @@ -866,7 +1107,6 @@ static void as_completed_request(struct request_queue *q, struct request *rq) * and writeback caches */ if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { - update_write_batch(ad); ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; ad->new_batch = 0; @@ -885,6 +1125,13 @@ static void as_completed_request(struct request_queue *q, struct request *rq) } as_put_io_context(rq); + + /* + * If elevator requested a queue switch, kick the queue in the + * hope that this is right time for switch. 
+ */ + if (ad->switch_queue) + kblockd_schedule_work(q, &ad->antic_work); out: RQ_SET_STATE(rq, AS_RQ_POSTSCHED); } @@ -901,9 +1148,13 @@ static void as_remove_queued_request(struct request_queue *q, const int data_dir = rq_is_sync(rq); struct as_data *ad = q->elevator->elevator_data; struct io_context *ioc; + struct as_queue *asq = elv_get_sched_queue(q, rq); WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); + BUG_ON(asq->nr_queued[data_dir] <= 0); + asq->nr_queued[data_dir]--; + ioc = RQ_IOC(rq); if (ioc && ioc->aic) { BUG_ON(!atomic_read(&ioc->aic->nr_queued)); @@ -914,8 +1165,8 @@ static void as_remove_queued_request(struct request_queue *q, * Update the "next_rq" cache if we are about to remove its * entry */ - if (ad->next_rq[data_dir] == rq) - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); + if (asq->next_rq[data_dir] == rq) + asq->next_rq[data_dir] = as_find_next_rq(ad, asq, rq); rq_fifo_clear(rq); as_del_rq_rb(ad, rq); @@ -929,23 +1180,23 @@ static void as_remove_queued_request(struct request_queue *q, * * See as_antic_expired comment. */ -static int as_fifo_expired(struct as_data *ad, int adir) +static int as_fifo_expired(struct as_data *ad, struct as_queue *asq, int adir) { struct request *rq; long delta_jif; - delta_jif = jiffies - ad->last_check_fifo[adir]; + delta_jif = jiffies - asq->last_check_fifo[adir]; if (unlikely(delta_jif < 0)) delta_jif = -delta_jif; if (delta_jif < ad->fifo_expire[adir]) return 0; - ad->last_check_fifo[adir] = jiffies; + asq->last_check_fifo[adir] = jiffies; - if (list_empty(&ad->fifo_list[adir])) + if (list_empty(&asq->fifo_list[adir])) return 0; - rq = rq_entry_fifo(ad->fifo_list[adir].next); + rq = rq_entry_fifo(asq->fifo_list[adir].next); return time_after(jiffies, rq_fifo_time(rq)); } @@ -954,7 +1205,7 @@ static int as_fifo_expired(struct as_data *ad, int adir) * as_batch_expired returns true if the current batch has expired. A batch * is a set of reads or a set of writes. 
*/ -static inline int as_batch_expired(struct as_data *ad) +static inline int as_batch_expired(struct as_data *ad, struct as_queue *asq) { if (ad->changed_batch || ad->new_batch) return 0; @@ -964,7 +1215,7 @@ static inline int as_batch_expired(struct as_data *ad) return time_after(jiffies, ad->current_batch_expires); return time_after(jiffies, ad->current_batch_expires) - || ad->current_write_count == 0; + || asq->current_write_count == 0; } /* @@ -973,6 +1224,7 @@ static inline int as_batch_expired(struct as_data *ad) static void as_move_to_dispatch(struct as_data *ad, struct request *rq) { const int data_dir = rq_is_sync(rq); + struct as_queue *asq = elv_get_sched_queue(ad->q, rq); BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); @@ -995,12 +1247,12 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq) ad->io_context = NULL; } - if (ad->current_write_count != 0) - ad->current_write_count--; + if (asq->current_write_count != 0) + asq->current_write_count--; } ad->ioc_finished = 0; - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); + asq->next_rq[data_dir] = as_find_next_rq(ad, asq, rq); /* * take it off the sort and fifo list, add to dispatch queue @@ -1014,6 +1266,8 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq) if (RQ_IOC(rq) && RQ_IOC(rq)->aic) atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); ad->nr_dispatched++; + as_log_asq(ad, asq, "dispatch req dir=%c nr_dispatched = %d", + data_dir ? 
'R' : 'W', ad->nr_dispatched); } /* @@ -1024,9 +1278,16 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq) static int as_dispatch_request(struct request_queue *q, int force) { struct as_data *ad = q->elevator->elevator_data; - const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]); - const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]); struct request *rq; + struct as_queue *asq = elv_select_sched_queue(q, force); + int reads, writes; + + if (!asq) + return 0; + + reads = !list_empty(&asq->fifo_list[BLK_RW_SYNC]); + writes = !list_empty(&asq->fifo_list[BLK_RW_ASYNC]); + if (unlikely(force)) { /* @@ -1042,44 +1303,52 @@ static int as_dispatch_request(struct request_queue *q, int force) ad->changed_batch = 0; ad->new_batch = 0; - while (ad->next_rq[BLK_RW_SYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]); + while (asq->next_rq[BLK_RW_SYNC]) { + as_move_to_dispatch(ad, asq->next_rq[BLK_RW_SYNC]); dispatched++; } - ad->last_check_fifo[BLK_RW_SYNC] = jiffies; + asq->last_check_fifo[BLK_RW_SYNC] = jiffies; - while (ad->next_rq[BLK_RW_ASYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]); + while (asq->next_rq[BLK_RW_ASYNC]) { + as_move_to_dispatch(ad, asq->next_rq[BLK_RW_ASYNC]); dispatched++; } - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; + asq->last_check_fifo[BLK_RW_ASYNC] = jiffies; + as_log_asq(ad, asq, "forced dispatch"); return dispatched; } /* Signal that the write batch was uncontended, so we can't time it */ if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) { - if (ad->current_write_count == 0 || !writes) - ad->write_batch_idled = 1; + if (asq->current_write_count == 0 || !writes) + asq->write_batch_idled = 1; } if (!(reads || writes) || ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT - || ad->changed_batch) + || ad->changed_batch) { + as_log_asq(ad, asq, "no dispatch. 
read_q=%d, writes_q=%d" + " ad->antic_status=%d, changed_batch=%d," + " switch_queue=%d new_batch=%d", asq->nr_queued[1], + asq->nr_queued[0], ad->antic_status, ad->changed_batch, + ad->switch_queue, ad->new_batch); return 0; + } - if (!(reads && writes && as_batch_expired(ad))) { + if (!(reads && writes && as_batch_expired(ad, asq))) { /* * batch is still running or no reads or no writes */ - rq = ad->next_rq[ad->batch_data_dir]; + rq = asq->next_rq[ad->batch_data_dir]; if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) { - if (as_fifo_expired(ad, BLK_RW_SYNC)) + if (as_fifo_expired(ad, asq, BLK_RW_SYNC)) goto fifo_expired; if (as_can_anticipate(ad, rq)) { + as_log_asq(ad, asq, "can_anticipate = 1"); as_antic_waitreq(ad); return 0; } @@ -1099,8 +1368,10 @@ static int as_dispatch_request(struct request_queue *q, int force) * data direction (read / write) */ + as_log_asq(ad, asq, "select a fresh batch and request"); + if (reads) { - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC])); + BUG_ON(RB_EMPTY_ROOT(&asq->sort_list[BLK_RW_SYNC])); if (writes && ad->batch_data_dir == BLK_RW_SYNC) /* @@ -1113,8 +1384,9 @@ static int as_dispatch_request(struct request_queue *q, int force) ad->changed_batch = 1; } ad->batch_data_dir = BLK_RW_SYNC; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next); - ad->last_check_fifo[ad->batch_data_dir] = jiffies; + as_log_asq(ad, asq, "new batch dir is sync"); + rq = rq_entry_fifo(asq->fifo_list[BLK_RW_SYNC].next); + asq->last_check_fifo[ad->batch_data_dir] = jiffies; goto dispatch_request; } @@ -1124,7 +1396,7 @@ static int as_dispatch_request(struct request_queue *q, int force) if (writes) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC])); + BUG_ON(RB_EMPTY_ROOT(&asq->sort_list[BLK_RW_ASYNC])); if (ad->batch_data_dir == BLK_RW_SYNC) { ad->changed_batch = 1; @@ -1137,10 +1409,11 @@ dispatch_writes: ad->new_batch = 0; } ad->batch_data_dir = BLK_RW_ASYNC; - ad->current_write_count = ad->write_batch_count; - 
ad->write_batch_idled = 0; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next); - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; + as_log_asq(ad, asq, "new batch dir is async"); + asq->current_write_count = asq->write_batch_count; + asq->write_batch_idled = 0; + rq = rq_entry_fifo(asq->fifo_list[BLK_RW_ASYNC].next); + asq->last_check_fifo[BLK_RW_ASYNC] = jiffies; goto dispatch_request; } @@ -1152,9 +1425,9 @@ dispatch_request: * If a request has expired, service it. */ - if (as_fifo_expired(ad, ad->batch_data_dir)) { + if (as_fifo_expired(ad, asq, ad->batch_data_dir)) { fifo_expired: - rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); + rq = rq_entry_fifo(asq->fifo_list[ad->batch_data_dir].next); } if (ad->changed_batch) { @@ -1172,6 +1445,9 @@ fifo_expired: ad->changed_batch = 0; } + if (ad->switch_queue) + return 0; + /* * rq is the selected appropriate request. */ @@ -1187,6 +1463,7 @@ static void as_add_request(struct request_queue *q, struct request *rq) { struct as_data *ad = q->elevator->elevator_data; int data_dir; + struct as_queue *asq = elv_get_sched_queue(q, rq); RQ_SET_STATE(rq, AS_RQ_NEW); @@ -1194,6 +1471,11 @@ static void as_add_request(struct request_queue *q, struct request *rq) rq->elevator_private = as_get_io_context(q->node); + asq->nr_queued[data_dir]++; + as_log_asq(ad, asq, "add a %c request read_q=%d write_q=%d", + data_dir ? 
'R' : 'W', asq->nr_queued[1], + asq->nr_queued[0]); + if (RQ_IOC(rq)) { as_update_iohist(ad, RQ_IOC(rq)->aic, rq); atomic_inc(&RQ_IOC(rq)->aic->nr_queued); @@ -1205,7 +1487,7 @@ static void as_add_request(struct request_queue *q, struct request *rq) * set expire time and add to fifo list */ rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); - list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); + list_add_tail(&rq->queuelist, &asq->fifo_list[data_dir]); as_update_rq(ad, rq); /* keep state machine up to date */ RQ_SET_STATE(rq, AS_RQ_QUEUED); @@ -1227,31 +1509,20 @@ static void as_deactivate_request(struct request_queue *q, struct request *rq) atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); } -/* - * as_queue_empty tells us if there are requests left in the device. It may - * not be the case that a driver can get the next request even if the queue - * is not empty - it is used in the block layer to check for plugging and - * merging opportunities - */ -static int as_queue_empty(struct request_queue *q) -{ - struct as_data *ad = q->elevator->elevator_data; - - return list_empty(&ad->fifo_list[BLK_RW_ASYNC]) - && list_empty(&ad->fifo_list[BLK_RW_SYNC]); -} - static int as_merge(struct request_queue *q, struct request **req, struct bio *bio) { - struct as_data *ad = q->elevator->elevator_data; sector_t rb_key = bio->bi_sector + bio_sectors(bio); struct request *__rq; + struct as_queue *asq = elv_get_sched_queue_bio(q, bio); + + if (!asq) + return ELEVATOR_NO_MERGE; /* * check for front merge */ - __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key); + __rq = elv_rb_find(&asq->sort_list[bio_data_dir(bio)], rb_key); if (__rq && elv_rq_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; @@ -1334,6 +1605,42 @@ static int as_may_queue(struct request_queue *q, int rw) return ret; } +/* Called with queue lock held */ +static void *as_alloc_as_queue(struct request_queue *q, + struct elevator_queue *eq, gfp_t gfp_mask, struct io_queue *ioq) +{ + 
struct as_queue *asq; + struct as_data *ad = eq->elevator_data; + + asq = kmalloc_node(sizeof(*asq), gfp_mask | __GFP_ZERO, q->node); + if (asq == NULL) + goto out; + + INIT_LIST_HEAD(&asq->fifo_list[BLK_RW_SYNC]); + INIT_LIST_HEAD(&asq->fifo_list[BLK_RW_ASYNC]); + asq->sort_list[BLK_RW_SYNC] = RB_ROOT; + asq->sort_list[BLK_RW_ASYNC] = RB_ROOT; + if (ad) + asq->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; + else + asq->write_batch_count = default_write_batch_expire / 10; + + if (asq->write_batch_count < 2) + asq->write_batch_count = 2; + asq->ioq = ioq; +out: + return asq; +} + +static void as_free_as_queue(struct elevator_queue *e, void *sched_queue) +{ + struct as_queue *asq = sched_queue; + + BUG_ON(!list_empty(&asq->fifo_list[BLK_RW_SYNC])); + BUG_ON(!list_empty(&asq->fifo_list[BLK_RW_ASYNC])); + kfree(asq); +} + static void as_exit_queue(struct elevator_queue *e) { struct as_data *ad = e->elevator_data; @@ -1341,9 +1648,6 @@ static void as_exit_queue(struct elevator_queue *e) del_timer_sync(&ad->antic_timer); cancel_work_sync(&ad->antic_work); - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC])); - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC])); - put_io_context(ad->io_context); kfree(ad); } @@ -1351,7 +1655,7 @@ static void as_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (as_data). 
*/ -static void *as_init_queue(struct request_queue *q) +static void *as_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct as_data *ad; @@ -1367,10 +1671,6 @@ static void *as_init_queue(struct request_queue *q) init_timer(&ad->antic_timer); INIT_WORK(&ad->antic_work, as_work_handler); - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]); - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]); - ad->sort_list[BLK_RW_SYNC] = RB_ROOT; - ad->sort_list[BLK_RW_ASYNC] = RB_ROOT; ad->fifo_expire[BLK_RW_SYNC] = default_read_expire; ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire; ad->antic_expire = default_antic_expire; @@ -1378,9 +1678,7 @@ static void *as_init_queue(struct request_queue *q) ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire; ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; - ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; - if (ad->write_batch_count < 2) - ad->write_batch_count = 2; + ad->switch_queue = 0; return ad; } @@ -1466,6 +1764,12 @@ static struct elv_fs_entry as_attrs[] = { AS_ATTR(antic_expire), AS_ATTR(read_batch_expire), AS_ATTR(write_batch_expire), +#ifdef CONFIG_IOSCHED_AS_HIER + ELV_ATTR(fairness), + ELV_ATTR(slice_sync), + ELV_ATTR(group_idle), + ELV_ATTR(map_sync), +#endif __ATTR_NULL }; @@ -1478,7 +1782,6 @@ static struct elevator_type iosched_as = { .elevator_add_req_fn = as_add_request, .elevator_activate_req_fn = as_activate_request, .elevator_deactivate_req_fn = as_deactivate_request, - .elevator_queue_empty_fn = as_queue_empty, .elevator_completed_req_fn = as_completed_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, @@ -1486,8 +1789,16 @@ static struct elevator_type iosched_as = { .elevator_init_fn = as_init_queue, .elevator_exit_fn = as_exit_queue, .trim = as_trim, + .elevator_alloc_sched_queue_fn = as_alloc_as_queue, + .elevator_free_sched_queue_fn = as_free_as_queue, +#ifdef CONFIG_IOSCHED_AS_HIER + 
.elevator_expire_ioq_fn = as_expire_ioq, + .elevator_active_ioq_set_fn = as_active_ioq_set, }, - + .elevator_features = ELV_IOSCHED_NEED_FQ | ELV_IOSCHED_SINGLE_IOQ, +#else + }, +#endif .elevator_attrs = as_attrs, .elevator_name = "anticipatory", .elevator_owner = THIS_MODULE, diff --git a/block/blk-core.c b/block/blk-core.c index 4b45435..6edf71d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -90,6 +90,27 @@ void blk_queue_congestion_threshold(struct request_queue *q) q->nr_congestion_off = nr; } +#ifdef CONFIG_GROUP_IOSCHED +int blk_queue_io_group_congested(struct backing_dev_info *bdi, int bdi_bits, + struct page *page) +{ + int ret = 0; + struct request_queue *q = bdi->unplug_io_data; + + if (!q || !q->elevator) + return bdi_congested(bdi, bdi_bits); + + /* Do we need to hold queue lock? */ + if (bdi_bits & (1 << BDI_sync_congested)) + ret |= elv_io_group_congested(q, page, 1); + + if (bdi_bits & (1 << BDI_async_congested)) + ret |= elv_io_group_congested(q, page, 0); + + return ret; +} +#endif + /** * blk_get_backing_dev_info - get the address of a queue's backing_dev_info * @bdev: device @@ -460,20 +481,30 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static int blk_init_free_list(struct request_queue *q) +void blk_init_request_list(struct request_list *rl) { - struct request_list *rl = &q->rq; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; - rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - rl->elvpriv = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); +} - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, request_cachep, q->node); +static int blk_init_free_list(struct request_queue *q) +{ + /* + * Initialize the queue request list in case there are non-hierarchical + * io schedulers not making use of fair queuing infrastructure. 
+ * + * For ioschedulers making use of fair queuing infrastructure, request + * list is inside the associated group and when that group is + * instantiated, it takes care of initializing the request list also. + */ + blk_init_request_list(&q->rq); + q->rq_data.rq_pool = mempool_create_node(BLKDEV_MIN_RQ, + mempool_alloc_slab, mempool_free_slab, + request_cachep, q->node); - if (!rl->rq_pool) + if (!q->rq_data.rq_pool) return -ENOMEM; return 0; @@ -575,6 +606,9 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) return NULL; } + /* init starved waiter wait queue */ + init_waitqueue_head(&q->rq_data.starved_wait); + /* * if caller didn't supply a lock, they get per-queue locking with * our embedded lock @@ -622,13 +656,14 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) { if (rq->cmd_flags & REQ_ELVPRIV) elv_put_request(q, rq); - mempool_free(rq, q->rq.rq_pool); + mempool_free(rq, q->rq_data.rq_pool); } static struct request * -blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) +blk_alloc_request(struct request_queue *q, struct bio *bio, int flags, int priv, + gfp_t gfp_mask) { - struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + struct request *rq = mempool_alloc(q->rq_data.rq_pool, gfp_mask); if (!rq) return NULL; @@ -638,8 +673,8 @@ blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) rq->cmd_flags = flags | REQ_ALLOCED; if (priv) { - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); + if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) { + mempool_free(rq, q->rq_data.rq_pool); return NULL; } rq->cmd_flags |= REQ_ELVPRIV; @@ -682,18 +717,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int sync) +static void __freed_request(struct request_queue *q, int sync, + struct request_list *rl) { 
- struct request_list *rl = &q->rq; - - if (rl->count[sync] < queue_congestion_off_threshold(q)) + if (q->rq_data.count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); - if (rl->count[sync] + 1 <= q->nr_requests) { + if (q->rq_data.count[sync] + 1 <= q->nr_requests) + blk_clear_queue_full(q, sync); + + if (rl->count[sync] + 1 <= q->nr_group_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); - - blk_clear_queue_full(q, sync); } } @@ -701,63 +736,133 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, int sync, int priv) -{ - struct request_list *rl = &q->rq; +static void freed_request(struct request_queue *q, int sync, int priv, + struct request_list *rl) +{ + /* There is a window during request allocation where request is + * mapped to one group but by the time a queue for the group is + * allocated, it is possible that original cgroup/io group has been + * deleted and now io queue is allocated in a different group (root) + * altogether. + * + * One solution to the problem is that rq should take io group + * reference. But it looks too much to do that to solve this issue. + * The only side effect to the hard to hit issue seems to be that + * we will try to decrement the rl->count for a request list which + * did not allocate that request. Check for rl->count going less than + * zero and do not decrement it if that's the case. 
+ */ + + if (priv && rl->count[sync] > 0) + rl->count[sync]--; + + BUG_ON(!q->rq_data.count[sync]); + q->rq_data.count[sync]--; - rl->count[sync]--; if (priv) - rl->elvpriv--; + q->rq_data.elvpriv--; - __freed_request(q, sync); + __freed_request(q, sync, rl); if (unlikely(rl->starved[sync ^ 1])) - __freed_request(q, sync ^ 1); + __freed_request(q, sync ^ 1, rl); + + /* Wake up the starved process on global list, if any */ + if (unlikely(q->rq_data.starved)) { + if (waitqueue_active(&q->rq_data.starved_wait)) + wake_up(&q->rq_data.starved_wait); + q->rq_data.starved--; + } +} + +/* + * Returns whether one can sleep on this request list or not. There are + * cases (elevator switch) where request list might not have allocated + * any request descriptor but we deny request allocation due to global + * limits. In that case one should sleep on global list as on this request + * list no wakeup will take place. + * + * Also sets the request list starved flag if there are no requests pending + * in the direction of rq. + * + * Return 1 --> sleep on request list, 0 --> sleep on global list + */ +static int can_sleep_on_request_list(struct request_list *rl, int is_sync) +{ + if (unlikely(rl->count[is_sync] == 0)) { + /* + * If there is a request pending in other direction + * in same io group, then set the starved flag of + * the group request list. Otherwise, we need to + * make this process sleep in global starved list + * to make sure it will not sleep indefinitely. + */ + if (rl->count[is_sync ^ 1] != 0) { + rl->starved[is_sync] = 1; + return 1; + } else + return 0; + } + + return 1; } /* * Get a free request, queue_lock must be held. - * Returns NULL on failure, with queue_lock held. + * Returns NULL on failure, with queue_lock held. Also sets the "reason" field + * in case of failure. This reason field helps caller decide whether to sleep + * on per group list or global per queue list. 
+ * reason = 0 sleep on per group list + * reason = 1 sleep on global list + * * Returns !NULL on success, with queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, gfp_t gfp_mask, + struct request_list *rl, int *reason) { struct request *rq = NULL; - struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue, priv; + int sleep_on_global = 0; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; - if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { - if (rl->count[is_sync]+1 >= q->nr_requests) { - ioc = current_io_context(GFP_ATOMIC, q->node); - /* - * The queue will fill after this allocation, so set - * it as full, and mark this process as "batching". - * This process will be allowed to complete a batch of - * requests, others will be blocked. - */ - if (!blk_queue_full(q, is_sync)) { - ioc_set_batching(q, ioc); - blk_set_queue_full(q, is_sync); - } else { - if (may_queue != ELV_MQUEUE_MUST - && !ioc_batching(q, ioc)) { - /* - * The queue is full and the allocating - * process is not a "batcher", and not - * exempted by the IO scheduler - */ - goto out; - } + if (q->rq_data.count[is_sync]+1 >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, is_sync); + + /* + * Looks like there is no user of queue full now. + * Keeping it for time being. + */ + if (q->rq_data.count[is_sync]+1 >= q->nr_requests) + blk_set_queue_full(q, is_sync); + + if (rl->count[is_sync]+1 >= q->nr_group_requests) { + ioc = current_io_context(GFP_ATOMIC, q->node); + /* + * The queue request descriptor group will fill after this + * allocation, so set + * it as full, and mark this process as "batching". + * This process will be allowed to complete a batch of + * requests, others will be blocked. 
+ */ + if (rl->count[is_sync] <= q->nr_group_requests) + ioc_set_batching(q, ioc); + else { + if (may_queue != ELV_MQUEUE_MUST + && !ioc_batching(q, ioc)) { + /* + * The queue is full and the allocating + * process is not a "batcher", and not + * exempted by the IO scheduler + */ + goto out; } } - blk_set_queue_congested(q, is_sync); } /* @@ -765,21 +870,60 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests */ - if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) + + if (q->rq_data.count[is_sync] >= (3 * q->nr_requests / 2)) { + /* + * Queue is too full for allocation. On which request queue + * the task should sleep? Generally it should sleep on its + * request list but if elevator switch is happening, in that + * window, request descriptors are allocated from global + * pool and are not accounted against any particular request + * list as group is going away. + * + * So it might happen that request list does not have any + * requests allocated at all and if process sleeps on per + * group request list, it will not be woken up. In such case, + * make it sleep on global starved list. + */ + if (test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags) + || !can_sleep_on_request_list(rl, is_sync)) + sleep_on_global = 1; + goto out; + } + + /* + * Allocation of request is allowed from queue perspective. Now check + * from per group request list + */ + + if (rl->count[is_sync] >= (3 * q->nr_group_requests / 2)) goto out; - rl->count[is_sync]++; rl->starved[is_sync] = 0; + q->rq_data.count[is_sync]++; + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + if (priv) { + q->rq_data.elvpriv++; + /* + * Account the request to request list only if request is + * going to elevator. 
During elevator switch, there will + * be small window where group is going away and new group + * will not be allocated till elevator switch is complete. + * So till then instead of slowing down the application, + * we will continue to allocate request from total common + * pool instead of per group limit + */ + rl->count[is_sync]++; + } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); + rq = blk_alloc_request(q, bio, rw_flags, priv, gfp_mask); + if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. Undo anything @@ -789,7 +933,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * wait queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, is_sync, priv); + freed_request(q, is_sync, priv, rl); /* * in the very unlikely event that allocation failed and no @@ -799,9 +943,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * rq mempool into READ and WRITE */ rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; - + if (!can_sleep_on_request_list(rl, is_sync)) + sleep_on_global = 1; goto out; } @@ -816,6 +959,8 @@ rq_starved: trace_block_getrq(q, bio, rw_flags & 1); out: + if (reason && sleep_on_global) + *reason = 1; return rq; } @@ -829,16 +974,44 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) { const bool is_sync = rw_is_sync(rw_flags) != 0; + int sleep_on_global = 0; struct request *rq; + struct request_list *rl = blk_get_request_list(q, bio); + struct io_group *iog = NULL; - rq = get_request(q, rw_flags, bio, GFP_NOIO); + rq = get_request(q, rw_flags, bio, GFP_NOIO, rl, &sleep_on_global); while (!rq) { DEFINE_WAIT(wait); struct io_context *ioc; - struct request_list *rl = &q->rq; - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, - TASK_UNINTERRUPTIBLE); + if (sleep_on_global) { + /* + * Task 
failed allocation and needs to wait and + * try again. There are no requests pending from + * the io group hence need to sleep on global + * wait queue. Most likely the allocation failed + * because of memory issues. + */ + + q->rq_data.starved++; + prepare_to_wait_exclusive(&q->rq_data.starved_wait, + &wait, TASK_UNINTERRUPTIBLE); + } else { + /* + * We are about to sleep on a request list and we + * drop queue lock. After waking up, we will do + * finish_wait() on request list and in the mean + * time group might be gone. Take a reference to + * the group now. + */ + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, + TASK_UNINTERRUPTIBLE); +#ifdef CONFIG_GROUP_IOSCHED + iog = rl_iog(rl); + if (iog) + elv_get_iog(iog); +#endif + } trace_block_sleeprq(q, bio, rw_flags & 1); @@ -856,9 +1029,30 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, ioc_set_batching(q, ioc); spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[is_sync], &wait); - rq = get_request(q, rw_flags, bio, GFP_NOIO); + if (sleep_on_global) { + finish_wait(&q->rq_data.starved_wait, &wait); + sleep_on_global = 0; + } else { + finish_wait(&rl->wait[is_sync], &wait); +#ifdef CONFIG_GROUP_IOSCHED + /* + * We had taken a reference to the rl/iog. 
+ * Put that now + */ + iog = rl_iog(rl); + if (iog) + elv_put_iog(iog); +#endif + } + + /* + * After the sleep check the rl again in case cgroup bio + * belonged to is gone and it is mapped to root group now + */ + rl = blk_get_request_list(q, bio); + rq = get_request(q, rw_flags, bio, GFP_NOIO, rl, + &sleep_on_global); }; return rq; @@ -867,14 +1061,16 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { struct request *rq; + struct request_list *rl; BUG_ON(rw != READ && rw != WRITE); spin_lock_irq(q->queue_lock); + rl = blk_get_request_list(q, NULL); if (gfp_mask & __GFP_WAIT) { rq = get_request_wait(q, rw, NULL); } else { - rq = get_request(q, rw, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask, rl, NULL); if (!rq) spin_unlock_irq(q->queue_lock); } @@ -1091,12 +1287,13 @@ void __blk_put_request(struct request_queue *q, struct request *req) if (req->cmd_flags & REQ_ALLOCED) { int is_sync = rq_is_sync(req) != 0; int priv = req->cmd_flags & REQ_ELVPRIV; + struct request_list *rl = rq_rl(q, req); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); - freed_request(q, is_sync, priv); + freed_request(q, is_sync, priv, rl); } } EXPORT_SYMBOL_GPL(__blk_put_request); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d4ed600..890d475 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -84,24 +84,31 @@ void exit_io_context(void) } } +void init_io_context(struct io_context *ioc) +{ + atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + spin_lock_init(&ioc->lock); + ioc->ioprio_changed = 0; + ioc->ioprio = 0; +#ifdef CONFIG_GROUP_IOSCHED + ioc->cgroup_changed = 0; +#endif + ioc->last_waited = jiffies; /* doesn't matter... 
*/ + ioc->nr_batch_requests = 0; /* because this is 0 */ + ioc->aic = NULL; + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ioc->cic_list); + ioc->ioc_data = NULL; +} + struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ret; ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_long_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->cic_list); - ret->ioc_data = NULL; - } + if (ret) + init_io_context(ret); return ret; } diff --git a/block/blk-settings.c b/block/blk-settings.c index bd582a7..78b8aec 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -148,6 +148,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) * set defaults */ q->nr_requests = BLKDEV_MAX_RQ; + q->nr_group_requests = BLKDEV_MAX_GROUP_RQ; q->make_request_fn = mfn; blk_queue_dma_alignment(q, 511); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 418d636..ed373b0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -38,42 +38,81 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl = &q->rq; + struct request_list *rl; unsigned long nr; int ret = queue_var_store(&nr, page, count); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; spin_lock_irq(q->queue_lock); + rl = blk_get_request_list(q, NULL); q->nr_requests = nr; blk_queue_congestion_threshold(q); - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + if (q->rq_data.count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) blk_set_queue_congested(q, BLK_RW_SYNC); - 
else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + else if (q->rq_data.count[BLK_RW_SYNC] < + queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, BLK_RW_SYNC); - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + if (q->rq_data.count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + else if (q->rq_data.count[BLK_RW_ASYNC] < + queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, BLK_RW_ASYNC); - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + if (q->rq_data.count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_SYNC); - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { + } else if (q->rq_data.count[BLK_RW_SYNC]+1 <= q->nr_requests) { blk_clear_queue_full(q, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + if (q->rq_data.count[BLK_RW_ASYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_ASYNC); - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { + } else if (q->rq_data.count[BLK_RW_ASYNC]+1 <= q->nr_requests) { blk_clear_queue_full(q, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } spin_unlock_irq(q->queue_lock); return ret; } +#ifdef CONFIG_GROUP_IOSCHED +static ssize_t queue_group_requests_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->nr_group_requests, (page)); +} + +extern void elv_io_group_congestion_threshold(struct request_queue *q, + struct io_group *iog); + +static ssize_t +queue_group_requests_store(struct request_queue *q, const char *page, + size_t count) +{ + struct hlist_node *n; + struct io_group *iog; + struct elv_fq_data *efqd; + unsigned long nr; + int ret = queue_var_store(&nr, page, count); + + if (nr < BLKDEV_MIN_RQ) + nr = BLKDEV_MIN_RQ; + + spin_lock_irq(q->queue_lock); + + q->nr_group_requests = nr; + + efqd = &q->elevator->efqd; + + 
hlist_for_each_entry(iog, n, &efqd->group_list, elv_data_node) { + elv_io_group_congestion_threshold(q, iog); + } + + spin_unlock_irq(q->queue_lock); + return ret; +} +#endif static ssize_t queue_ra_show(struct request_queue *q, char *page) { @@ -240,6 +279,14 @@ static struct queue_sysfs_entry queue_requests_entry = { .store = queue_requests_store, }; +#ifdef CONFIG_GROUP_IOSCHED +static struct queue_sysfs_entry queue_group_requests_entry = { + .attr = {.name = "nr_group_requests", .mode = S_IRUGO | S_IWUSR }, + .show = queue_group_requests_show, + .store = queue_group_requests_store, +}; +#endif + static struct queue_sysfs_entry queue_ra_entry = { .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, .show = queue_ra_show, @@ -314,6 +361,9 @@ static struct queue_sysfs_entry queue_iostats_entry = { static struct attribute *default_attrs[] = { &queue_requests_entry.attr, +#ifdef CONFIG_GROUP_IOSCHED + &queue_group_requests_entry.attr, +#endif &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, @@ -393,12 +443,11 @@ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - struct request_list *rl = &q->rq; blk_sync_queue(q); - if (rl->rq_pool) - mempool_destroy(rl->rq_pool); + if (q->rq_data.rq_pool) + mempool_destroy(q->rq_data.rq_pool); if (q->queue_tags) __blk_queue_free_tags(q); diff --git a/block/blk.h b/block/blk.h index 3fae6ad..99c3819 100644 --- a/block/blk.h +++ b/block/blk.h @@ -71,6 +71,8 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + elv_fq_activate_rq(q, rq); + if (e->ops->elevator_activate_req_fn) e->ops->elevator_activate_req_fn(q, rq); } @@ -79,6 +81,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq { struct elevator_queue *e = q->elevator; + elv_fq_deactivate_rq(q, rq); + if (e->ops->elevator_deactivate_req_fn) 
e->ops->elevator_deactivate_req_fn(q, rq); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fd7080e..9527f46 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -12,7 +12,6 @@ #include #include #include - /* * tunables */ @@ -23,17 +22,10 @@ static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; static const int cfq_back_max = 16 * 1024; /* penalty of a backwards seek */ static const int cfq_back_penalty = 2; -static const int cfq_slice_sync = HZ / 10; -static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; /* - * offset from end of service tree - */ -#define CFQ_IDLE_DELAY (HZ / 5) - -/* * below this threshold, we consider thinktime immediate */ #define CFQ_MIN_TT (2) @@ -43,7 +35,7 @@ static int cfq_slice_idle = HZ / 125; #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) +#define RQ_CFQQ(rq) (struct cfq_queue *) (ioq_sched_queue((rq)->ioq)) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -53,8 +45,6 @@ static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); #define CFQ_PRIO_LISTS IOPRIO_BE_NR -#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) @@ -74,16 +64,11 @@ struct cfq_rb_root { * Per process-grouping structure */ struct cfq_queue { - /* reference count */ - atomic_t ref; + struct io_queue *ioq; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ struct cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - unsigned long rb_key; /* prio tree member */ struct rb_node p_node; /* prio tree root we belong to, if any */ @@ -99,18 +84,13 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; - unsigned 
long slice_end; - long slice_resid; unsigned int slice_dispatch; /* pending metadata requests */ int meta_pending; - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; + unsigned short org_ioprio, org_ioprio_class; pid_t pid; }; @@ -120,12 +100,6 @@ struct cfq_queue { */ struct cfq_data { struct request_queue *queue; - - /* - * rr list of queues with requests and the count of them - */ - struct cfq_rb_root service_tree; - /* * Each priority tree is sorted by next_request position. These * trees are used when determining if two or more queues are @@ -133,14 +107,6 @@ struct cfq_data { */ struct rb_root prio_trees[CFQ_PRIO_LISTS]; - unsigned int busy_queues; - /* - * Used to track any pending rt requests so we can pre-empt current - * non-RT cfqq in service when this value is non-zero. - */ - unsigned int busy_rt_queues; - - int rq_in_driver; int sync_flight; /* @@ -151,21 +117,8 @@ struct cfq_data { int hw_tag_samples; int rq_in_driver_peak; - /* - * idle window management - */ - struct timer_list idle_slice_timer; - struct work_struct unplug_work; - - struct cfq_queue *active_queue; struct cfq_io_context *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -175,7 +128,6 @@ struct cfq_data { unsigned int cfq_fifo_expire[2]; unsigned int cfq_back_penalty; unsigned int cfq_back_max; - unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; @@ -188,16 +140,10 @@ struct cfq_data { }; enum cfqq_state_flags { - CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ - CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ - CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ 
CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ - CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ - CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ }; @@ -215,16 +161,10 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ } -CFQ_CFQQ_FNS(on_rr); -CFQ_CFQQ_FNS(wait_request); -CFQ_CFQQ_FNS(must_dispatch); CFQ_CFQQ_FNS(must_alloc); CFQ_CFQQ_FNS(must_alloc_slice); CFQ_CFQQ_FNS(fifo_expire); -CFQ_CFQQ_FNS(idle_window); CFQ_CFQQ_FNS(prio_changed); -CFQ_CFQQ_FNS(slice_new); -CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); #undef CFQ_CFQQ_FNS @@ -234,8 +174,8 @@ CFQ_CFQQ_FNS(coop); blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, - struct io_context *, gfp_t); +static struct cfq_queue *cfq_get_queue(struct cfq_data *, struct bio *bio, + int, struct io_context *, gfp_t); static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); @@ -245,84 +185,79 @@ static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, return cic->cfqq[!!is_sync]; } -static inline void cic_set_cfqq(struct cfq_io_context *cic, - struct cfq_queue *cfqq, int is_sync) -{ - cic->cfqq[!!is_sync] = cfqq; -} - /* - * We regard a request as SYNC, if it's either a read or has the SYNC bit - * set (in which case it could also be direct WRITE). + * Determine the cfq queue bio should go in. This is primarily used by + * front merge and allow merge functions. + * + * Currently this function takes the ioprio and ioprio_class from task + * submitting async bio. 
Later save the task information in the page_cgroup + * and retrieve task's ioprio and class from there. */ -static inline int cfq_bio_sync(struct bio *bio) +static struct cfq_queue *cic_bio_to_cfqq(struct cfq_data *cfqd, + struct cfq_io_context *cic, struct bio *bio, int is_sync) { - if (bio_data_dir(bio) == READ || bio_sync(bio)) - return 1; + struct cfq_queue *cfqq = NULL; - return 0; -} + cfqq = cic_to_cfqq(cic, is_sync); -/* - * scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing - */ -static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) -{ - if (cfqd->busy_queues) { - cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + if (!cfqq && !is_sync) { + const int ioprio = task_ioprio(cic->ioc); + const int ioprio_class = task_ioprio_class(cic->ioc); + struct io_group *iog; + /* + * async bio tracking is enabled and we are not caching + * async queue pointer in cic. + */ + iog = io_get_io_group_bio(cfqd->queue, bio, 0); + if (!iog) { + /* + * May be this is first rq/bio and io group has not + * been setup yet. + */ + return NULL; + } + return io_group_async_queue_prio(iog, ioprio_class, ioprio); } +#endif + return cfqq; } -static int cfq_queue_empty(struct request_queue *q) +static inline void cic_set_cfqq(struct cfq_io_context *cic, + struct cfq_queue *cfqq, int is_sync) { - struct cfq_data *cfqd = q->elevator->elevator_data; - - return !cfqd->busy_queues; +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + /* + * Don't cache async queue pointer as now one io context might + * be submitting async io for various different async queues + */ + if (!is_sync) + return; +#endif + cic->cfqq[!!is_sync] = cfqq; } -/* - * Scale schedule slice based on io priority. Use the sync time slice only - * if a queue is marked sync and has sync io queued. A sync queue with async - * io only, should not get full sync slice length. 
- */ -static inline int cfq_prio_slice(struct cfq_data *cfqd, int sync, - unsigned short prio) +static inline struct io_group *cfqq_to_io_group(struct cfq_queue *cfqq) { - const int base_slice = cfqd->cfq_slice[sync]; - - WARN_ON(prio >= IOPRIO_BE_NR); - - return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); + return ioq_to_io_group(cfqq->ioq); } -static inline int -cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static inline int cfq_class_idle(struct cfq_queue *cfqq) { - return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); + return elv_ioq_class_idle(cfqq->ioq); } -static inline void -cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static inline int cfq_cfqq_sync(struct cfq_queue *cfqq) { - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; - cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); + return elv_ioq_sync(cfqq->ioq); } -/* - * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end - * isn't valid until the first request from the dispatch is activated - * and the slice time set. 
- */ -static inline int cfq_slice_used(struct cfq_queue *cfqq) +static inline int cfqq_is_active_queue(struct cfq_queue *cfqq) { - if (cfq_cfqq_slice_new(cfqq)) - return 0; - if (time_before(jiffies, cfqq->slice_end)) - return 0; + struct cfq_data *cfqd = cfqq->cfqd; + struct elevator_queue *e = cfqd->queue->elevator; - return 1; + return (elv_active_sched_queue(e) == cfqq); } /* @@ -421,33 +356,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) } /* - * The below is leftmost cache rbtree addon - */ -static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) -{ - if (!root->left) - root->left = rb_first(&root->rb); - - if (root->left) - return rb_entry(root->left, struct cfq_queue, rb_node); - - return NULL; -} - -static void rb_erase_init(struct rb_node *n, struct rb_root *root) -{ - rb_erase(n, root); - RB_CLEAR_NODE(n); -} - -static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) -{ - if (root->left == n) - root->left = NULL; - rb_erase_init(n, &root->rb); -} - -/* * would be nice to take fifo expire time into account as well */ static struct request * @@ -460,10 +368,10 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - if (rbprev) + if (rbprev != NULL) prev = rb_entry_rq(rbprev); - if (rbnext) + if (rbnext != NULL) next = rb_entry_rq(rbnext); else { rbnext = rb_first(&cfqq->sort_list); @@ -474,95 +382,6 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, return cfq_choose_req(cfqd, next, prev); } -static unsigned long cfq_slice_offset(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - /* - * just an approximation, should be ok. - */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); -} - -/* - * The cfqd->service_tree holds all pending cfq_queue's that have - * requests waiting to be processed. It is sorted in the order that - * we will service the queues. 
- */ -static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, - int add_front) -{ - struct rb_node **p, *parent; - struct cfq_queue *__cfqq; - unsigned long rb_key; - int left; - - if (cfq_class_idle(cfqq)) { - rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); - if (parent && parent != &cfqq->rb_node) { - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - rb_key += __cfqq->rb_key; - } else - rb_key += jiffies; - } else if (!add_front) { - rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; - rb_key += cfqq->slice_resid; - cfqq->slice_resid = 0; - } else - rb_key = 0; - - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - /* - * same position, nothing more to do - */ - if (rb_key == cfqq->rb_key) - return; - - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); - } - - left = 1; - parent = NULL; - p = &cfqd->service_tree.rb.rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - - /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. 
- */ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (rb_key < __cfqq->rb_key) - n = &(*p)->rb_left; - else - n = &(*p)->rb_right; - - if (n == &(*p)->rb_right) - left = 0; - - p = n; - } - - if (left) - cfqd->service_tree.left = &cfqq->rb_node; - - cfqq->rb_key = rb_key; - rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); -} - static struct cfq_queue * cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, @@ -624,57 +443,43 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } -/* - * Update cfqq's position in the service tree. - */ -static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) +/* An active ioq is being reset. A chance to do cic related stuff. */ +static void cfq_active_ioq_reset(struct request_queue *q, void *sched_queue) { - /* - * Resorting requires the cfqq to be on the RR list already. 
- */ - if (cfq_cfqq_on_rr(cfqq)) { - cfq_service_tree_add(cfqd, cfqq, 0); - cfq_prio_tree_add(cfqd, cfqq); - } -} + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = sched_queue; -/* - * add to busy list of queues for service, trying to be fair in ordering - * the pending list according to last request service - */ -static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); - BUG_ON(cfq_cfqq_on_rr(cfqq)); - cfq_mark_cfqq_on_rr(cfqq); - cfqd->busy_queues++; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues++; + if (cfqd->active_cic) { + put_io_context(cfqd->active_cic->ioc); + cfqd->active_cic = NULL; + } - cfq_resort_rr_list(cfqd, cfqq); + /* Resort the cfqq in prio tree */ + if (cfqq) + cfq_prio_tree_add(cfqd, cfqq); } -/* - * Called when the cfqq no longer has requests pending, remove it from - * the service tree. - */ -static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) +/* An ioq has been set as active one. */ +static void cfq_active_ioq_set(struct request_queue *q, void *sched_queue, + int coop) { - cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - cfq_clear_cfqq_on_rr(cfqq); + struct cfq_queue *cfqq = sched_queue; - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } + cfqq->slice_dispatch = 0; + + cfq_clear_cfqq_must_alloc_slice(cfqq); + cfq_clear_cfqq_fifo_expire(cfqq); - BUG_ON(!cfqd->busy_queues); - cfqd->busy_queues--; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues--; + /* + * If queue was selected because it was a close cooperator, then + * mark it so that it is not selected again and again. Otherwise + * clear the coop flag so that it becomes eligible to get selected + * again. 
+ */ + if (coop) + cfq_mark_cfqq_coop(cfqq); + else + cfq_clear_cfqq_coop(cfqq); } /* @@ -683,7 +488,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -691,8 +495,17 @@ static void cfq_del_rq_rb(struct request *rq) elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + /* + * If this was last request in the queue, remove this queue from + * prio trees. For last request nr_queued count will still be 1 as + * elevator fair queuing layer is yet to do the accounting. + */ + if (elv_ioq_nr_queued(cfqq->ioq) == 1) { + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -710,9 +523,6 @@ static void cfq_add_rq_rb(struct request *rq) while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) cfq_dispatch_insert(cfqd->queue, __alias); - if (!cfq_cfqq_on_rr(cfqq)) - cfq_add_cfqq_rr(cfqd, cfqq); - /* * check if this request is a better next-serve candidate */ @@ -746,7 +556,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) if (!cic) return NULL; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_bio_to_cfqq(cfqd, cic, bio, elv_bio_sync(bio)); if (cfqq) { sector_t sector = bio->bi_sector + bio_sectors(bio); @@ -760,23 +570,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver++; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - cfqd->rq_in_driver); - cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } -static void cfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - 
WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - cfqd->rq_in_driver); -} - static void cfq_remove_request(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); @@ -843,7 +639,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, /* * Disallow merge of a sync bio into an async request. */ - if (cfq_bio_sync(bio) && !rq_is_sync(rq)) + if (elv_bio_sync(bio) && !rq_is_sync(rq)) return 0; /* @@ -854,100 +650,28 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, if (!cic) return 0; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_bio_to_cfqq(cfqd, cic, bio, elv_bio_sync(bio)); if (cfqq == RQ_CFQQ(rq)) return 1; return 0; } -static void __cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active"); - cfqq->slice_end = 0; - cfqq->slice_dispatch = 0; - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_must_dispatch(cfqq); - cfq_clear_cfqq_must_alloc_slice(cfqq); - cfq_clear_cfqq_fifo_expire(cfqq); - cfq_mark_cfqq_slice_new(cfqq); - - del_timer(&cfqd->idle_slice_timer); - } - - cfqd->active_queue = cfqq; -} - /* * current cfqq expired its slice (or was too idle), select new one */ static void -__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - int timed_out) +__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); - - if (cfq_cfqq_wait_request(cfqq)) - del_timer(&cfqd->idle_slice_timer); - - cfq_clear_cfqq_wait_request(cfqq); - - /* - * store what was left of this slice, if the queue idled/timed out - */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) { - cfqq->slice_resid = cfqq->slice_end - jiffies; - cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); - } - - cfq_resort_rr_list(cfqd, cfqq); - - if (cfqq == cfqd->active_queue) - cfqd->active_queue = NULL; - - if (cfqd->active_cic) 
{ - put_io_context(cfqd->active_cic->ioc); - cfqd->active_cic = NULL; - } + __elv_ioq_slice_expired(cfqd->queue, cfqq->ioq); } -static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) +static inline void cfq_slice_expired(struct cfq_data *cfqd) { - struct cfq_queue *cfqq = cfqd->active_queue; + struct cfq_queue *cfqq = elv_active_sched_queue(cfqd->queue->elevator); if (cfqq) - __cfq_slice_expired(cfqd, cfqq, timed_out); -} - -/* - * Get next queue for service. Unless we have a queue preemption, - * we'll simply select the first cfqq in the service tree. - */ -static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) -{ - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) - return NULL; - - return cfq_rb_first(&cfqd->service_tree); -} - -/* - * Get and set a new active queue for service. - */ -static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (!cfqq) { - cfqq = cfq_get_next_queue(cfqd); - if (cfqq) - cfq_clear_cfqq_coop(cfqq); - } - - __cfq_set_active_queue(cfqd, cfqq); - return cfqq; + __cfq_slice_expired(cfqd, cfqq); } static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, @@ -1024,11 +748,11 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid * assumption. 
*/ -static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq, - int probe) +static struct io_queue *cfq_close_cooperator(struct request_queue *q, + void *cur_sched_queue) { - struct cfq_queue *cfqq; + struct cfq_queue *cur_cfqq = cur_sched_queue, *cfqq; + struct cfq_data *cfqd = q->elevator->elevator_data; /* * A valid cfq_io_context is necessary to compare requests against @@ -1049,14 +773,13 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (cfq_cfqq_coop(cfqq)) return NULL; - if (!probe) - cfq_mark_cfqq_coop(cfqq); - return cfqq; + return cfqq->ioq; } -static void cfq_arm_slice_timer(struct cfq_data *cfqd) +static void cfq_arm_slice_timer(struct request_queue *q, void *sched_queue) { - struct cfq_queue *cfqq = cfqd->active_queue; + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = sched_queue; struct cfq_io_context *cic; unsigned long sl; @@ -1069,18 +792,18 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); - WARN_ON(cfq_cfqq_slice_new(cfqq)); + WARN_ON(elv_ioq_slice_new(cfqq->ioq)); /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) + if (!cfqd->cfq_slice_idle || !elv_ioq_idle_window(cfqq->ioq)) return; /* * still requests with the driver, don't idle */ - if (cfqd->rq_in_driver) + if (elv_rq_in_driver(q->elevator)) return; /* @@ -1090,7 +813,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) if (!cic || !atomic_read(&cic->ioc->nr_tasks)) return; - cfq_mark_cfqq_wait_request(cfqq); + elv_mark_ioq_wait_request(cfqq->ioq); /* * we don't want to idle for seeks, but we do want to allow @@ -1101,7 +824,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); - mod_timer(&cfqd->idle_slice_timer, jiffies + sl); + 
elv_mod_idle_slice_timer(q->elevator, jiffies + sl); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); } @@ -1110,13 +833,12 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) */ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) { - struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); + struct cfq_data *cfqd = q->elevator->elevator_data; - cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfq_log_cfqq(cfqd, cfqq, "dispatch_insert sect=%d", blk_rq_sectors(rq)); cfq_remove_request(rq); - cfqq->dispatched++; elv_dispatch_sort(q, rq); if (cfq_cfqq_sync(cfqq)) @@ -1154,78 +876,11 @@ static inline int cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { const int base_rq = cfqd->cfq_slice_async_rq; + unsigned short ioprio = elv_ioq_ioprio(cfqq->ioq); - WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); -} - -/* - * Select a queue for service. If we have a current active queue, - * check whether to continue servicing it, or retrieve and set a new one. - */ -static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq, *new_cfqq = NULL; - - cfqq = cfqd->active_queue; - if (!cfqq) - goto new_queue; - - /* - * The active queue has run out of time, expire it and select new. - */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) - goto expire; - - /* - * If we have a RT cfqq waiting, then we pre-empt the current non-rt - * cfqq. - */ - if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { - /* - * We simulate this as cfqq timed out so that it gets to bank - * the remaining of its time slice. - */ - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - goto new_queue; - } - - /* - * The active queue has requests and isn't expired, allow it to - * dispatch. 
- */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto keep_queue; - - /* - * If another queue has a request waiting within our mean seek - * distance, let it run. The expire code will check for close - * cooperators and put the close queue at the front of the service - * tree. - */ - new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); - if (new_cfqq) - goto expire; - - /* - * No requests pending. If the active queue still has requests in - * flight or is idling for a new request, allow either of these - * conditions to happen (or time out) before selecting a new queue. - */ - if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { - cfqq = NULL; - goto keep_queue; - } + WARN_ON(ioprio >= IOPRIO_BE_NR); -expire: - cfq_slice_expired(cfqd, 0); -new_queue: - cfqq = cfq_set_active_queue(cfqd, new_cfqq); -keep_queue: - return cfqq; + return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - ioprio)); } static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) @@ -1250,12 +905,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) + while ((cfqq = elv_select_sched_queue(cfqd->queue, 1)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); - cfq_slice_expired(cfqd, 0); + /* This probably is redundant now. 
above loop should make sure + * that all the busy queues have expired */ + cfq_slice_expired(cfqd); - BUG_ON(cfqd->busy_queues); + BUG_ON(elv_nr_busy_ioq(cfqd->queue->elevator)); cfq_log(cfqd, "forced_dispatch=%d", dispatched); return dispatched; @@ -1301,13 +958,10 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) struct cfq_queue *cfqq; unsigned int max_dispatch; - if (!cfqd->busy_queues) - return 0; - if (unlikely(force)) return cfq_forced_dispatch(cfqd); - cfqq = cfq_select_queue(cfqd); + cfqq = elv_select_sched_queue(q, 0); if (!cfqq) return 0; @@ -1324,7 +978,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) /* * Does this cfqq already have too much IO in flight? */ - if (cfqq->dispatched >= max_dispatch) { + if (elv_ioq_nr_dispatched(cfqq->ioq) >= max_dispatch) { /* * idle queue must always only have a single IO in flight */ @@ -1334,13 +988,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1) + if (elv_nr_busy_ioq(q->elevator) > 1) return 0; /* * we are the only queue, allow up to 4 times of 'quantum' */ - if (cfqq->dispatched >= 4 * max_dispatch) + if (elv_ioq_nr_dispatched(cfqq->ioq) >= 4 * max_dispatch) return 0; } @@ -1349,51 +1003,45 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) */ cfq_dispatch_request(cfqd, cfqq); cfqq->slice_dispatch++; - cfq_clear_cfqq_must_dispatch(cfqq); /* * expire an async queue immediately if it has used up its slice. idle * queue always expire after 1 dispatch round. 
*/ - if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && + if (elv_nr_busy_ioq(q->elevator) > 1 && ((!cfq_cfqq_sync(cfqq) && cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || cfq_class_idle(cfqq))) { - cfqq->slice_end = jiffies + 1; - cfq_slice_expired(cfqd, 0); + cfq_slice_expired(cfqd); } cfq_log(cfqd, "dispatched a request"); return 1; } -/* - * task holds one reference to the queue, dropped when task exits. each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * queue lock must be held here. - */ -static void cfq_put_queue(struct cfq_queue *cfqq) +static void cfq_free_cfq_queue(struct elevator_queue *e, void *sched_queue) { + struct cfq_queue *cfqq = sched_queue; struct cfq_data *cfqd = cfqq->cfqd; - BUG_ON(atomic_read(&cfqq->ref) <= 0); + BUG_ON(!cfqq); - if (!atomic_dec_and_test(&cfqq->ref)) - return; - - cfq_log_cfqq(cfqd, cfqq, "put_queue"); + cfq_log_cfqq(cfqd, cfqq, "free_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); - if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); + if (unlikely(cfqq_is_active_queue(cfqq))) { + __cfq_slice_expired(cfqd, cfqq); + elv_schedule_dispatch(cfqd->queue); } kmem_cache_free(cfq_pool, cfqq); } +static inline void cfq_put_queue(struct cfq_queue *cfqq) +{ + elv_put_ioq(cfqq->ioq); +} + /* * Must always be called with the rcu_read_lock() held */ @@ -1481,9 +1129,9 @@ static void cfq_free_io_context(struct io_context *ioc) static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); + if (unlikely(cfqq == elv_active_sched_queue(cfqd->queue->elevator))) { + __cfq_slice_expired(cfqd, cfqq); + elv_schedule_dispatch(cfqd->queue); } cfq_put_queue(cfqq); @@ -1553,9 +1201,10 @@ static struct cfq_io_context * 
cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct cfq_io_context *cic; + struct request_queue *q = cfqd->queue; cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, - cfqd->queue->node); + q->node); if (cic) { cic->last_end_request = jiffies; INIT_LIST_HEAD(&cic->queue_list); @@ -1571,7 +1220,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) { struct task_struct *tsk = current; - int ioprio_class; + int ioprio_class, ioprio; if (!cfq_cfqq_prio_changed(cfqq)) return; @@ -1584,30 +1233,33 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) /* * no prio set, inherit CPU scheduling settings */ - cfqq->ioprio = task_nice_ioprio(tsk); - cfqq->ioprio_class = task_nice_ioclass(tsk); + ioprio = task_nice_ioprio(tsk); + ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(ioc); - cfqq->ioprio_class = IOPRIO_CLASS_RT; + ioprio = task_ioprio(ioc); + ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = task_ioprio(ioc); - cfqq->ioprio_class = IOPRIO_CLASS_BE; + ioprio = task_ioprio(ioc); + ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: - cfqq->ioprio_class = IOPRIO_CLASS_IDLE; - cfqq->ioprio = 7; - cfq_clear_cfqq_idle_window(cfqq); + ioprio_class = IOPRIO_CLASS_IDLE; + ioprio = 7; + elv_clear_ioq_idle_window(cfqq->ioq); break; } + elv_ioq_set_ioprio_class(cfqq->ioq, ioprio_class); + elv_ioq_set_ioprio(cfqq->ioq, ioprio); + /* * keep track of original prio settings in case we have to temporarily * elevate the priority of this queue */ - cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; + cfqq->org_ioprio = ioprio; + cfqq->org_ioprio_class = ioprio_class; cfq_clear_cfqq_prio_changed(cfqq); } @@ -1616,28 +1268,43 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) struct cfq_data *cfqd 
= cic->key; struct cfq_queue *cfqq; unsigned long flags; + struct request_queue *q = cfqd->queue; if (unlikely(!cfqd)) return; - spin_lock_irqsave(cfqd->queue->queue_lock, flags); + spin_lock_irqsave(q->queue_lock, flags); cfqq = cic->cfqq[BLK_RW_ASYNC]; + if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, + + /* + * Drop the reference to old queue unconditionally. Don't + * worry whether new async prio queue has been allocated + * or not. + */ + cic_set_cfqq(cic, NULL, BLK_RW_ASYNC); + cfq_put_queue(cfqq); + + /* + * Why to allocate new queue now? Will it not be automatically + * allocated whenever another async request from same context + * comes? Keeping it for the time being because existing cfq + * code allocates the new queue immediately upon prio change + */ + new_cfqq = cfq_get_queue(cfqd, NULL, BLK_RW_ASYNC, cic->ioc, GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + if (new_cfqq) + cic_set_cfqq(cic, new_cfqq, BLK_RW_ASYNC); } cfqq = cic->cfqq[BLK_RW_SYNC]; if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + spin_unlock_irqrestore(q->queue_lock, flags); } static void cfq_ioc_set_ioprio(struct io_context *ioc) @@ -1649,42 +1316,136 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, pid_t pid, int is_sync) { - RB_CLEAR_NODE(&cfqq->rb_node); RB_CLEAR_NODE(&cfqq->p_node); INIT_LIST_HEAD(&cfqq->fifo); - atomic_set(&cfqq->ref, 0); cfqq->cfqd = cfqd; cfq_mark_cfqq_prio_changed(cfqq); if (is_sync) { if (!cfq_class_idle(cfqq)) - cfq_mark_cfqq_idle_window(cfqq); - cfq_mark_cfqq_sync(cfqq); + elv_mark_ioq_idle_window(cfqq->ioq); + elv_mark_ioq_sync(cfqq->ioq); } cfqq->pid = pid; } +#ifdef CONFIG_IOSCHED_CFQ_HIER +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *async_cfqq = cic_to_cfqq(cic, 
0); + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + struct io_group *iog, *__iog; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + iog = io_get_io_group(q, NULL, 0); + + if (async_cfqq != NULL) { + __iog = cfqq_to_io_group(async_cfqq); + if (iog != __iog) { + /* cgroup changed, drop the reference to async queue */ + cic_set_cfqq(cic, NULL, 0); + cfq_put_queue(async_cfqq); + } + } + + if (sync_cfqq != NULL) { + __iog = cfqq_to_io_group(sync_cfqq); + + /* + * Drop reference to sync queue. A new sync queue will + * be assigned in new group upon arrival of a fresh request. + * If old queue has got requests, those requests will be + * dispatched over a period of time and queue will be freed + * automatically. + */ + if (iog != __iog) { + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_IOSCHED_CFQ_HIER */ + static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, +cfq_find_alloc_queue(struct cfq_data *cfqd, struct bio *bio, int is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct request_queue *q = cfqd->queue; + struct io_queue *ioq = NULL, *new_ioq = NULL; + struct io_group *iog = NULL; retry: + iog = io_get_io_group_bio(q, bio, 1); + cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + if (!cfqq && !is_sync) { + const int ioprio = task_ioprio(cic->ioc); + const int ioprio_class = task_ioprio_class(cic->ioc); + + /* + * We have not cached async queue pointer as bio tracking + * is enabled. 
Look into group async queue array using ioc + * class and prio to see if somebody already allocated the + * queue. + */ + + cfqq = io_group_async_queue_prio(iog, ioprio_class, ioprio); + } +#endif + /* * Always try a new alloc if we fell back to the OOM cfqq * originally, since it should just be a temporary situation. */ if (!cfqq || cfqq == &cfqd->oom_cfqq) { + /* Allocate ioq object first and then cfqq */ + if (new_ioq) { + goto alloc_cfqq; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(cfqd->queue->queue_lock); + new_ioq = elv_alloc_ioq(q, gfp_mask | __GFP_ZERO); + spin_lock_irq(cfqd->queue->queue_lock); + if (new_ioq) + goto retry; + } else + ioq = elv_alloc_ioq(q, gfp_mask | __GFP_ZERO); + +alloc_cfqq: + if (!ioq && !new_ioq) { + /* ioq allocation failed. Default to oom_cfqq */ + cfqq = &cfqd->oom_cfqq; + goto out; + } + cfqq = NULL; if (new_cfqq) { + ioq = new_ioq; + new_ioq = NULL; cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { @@ -1702,60 +1463,83 @@ retry: } if (cfqq) { + elv_init_ioq(q->elevator, ioq, current->pid, is_sync); + elv_init_ioq_io_group(q->elevator, ioq, iog); + elv_init_ioq_prio_data(q->elevator, ioq, + IOPRIO_CLASS_BE, IOPRIO_NORM); + elv_init_ioq_sched_queue(q->elevator, ioq, cfqq); + + cfqq->ioq = ioq; cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + + /* ioq reference on iog */ + elv_get_iog(iog); cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else + } else { cfqq = &cfqd->oom_cfqq; + /* If ioq allocation was successful, free it up */ + if (ioq) + elv_free_ioq(ioq); + } } + if (new_ioq) + elv_free_ioq(new_ioq); + if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); +out: return cfqq; } -static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; - case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return 
&cfqd->async_idle_cfqq; - default: - BUG(); - } -} - static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, - gfp_t gfp_mask) +cfq_get_queue(struct cfq_data *cfqd, struct bio *bio, int is_sync, + struct io_context *ioc, gfp_t gfp_mask) { const int ioprio = task_ioprio(ioc); const int ioprio_class = task_ioprio_class(ioc); - struct cfq_queue **async_cfqq = NULL; + struct cfq_queue *async_cfqq = NULL; struct cfq_queue *cfqq = NULL; + struct io_group *iog = io_get_io_group_bio(cfqd->queue, bio, 1); if (!is_sync) { - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); - cfqq = *async_cfqq; + async_cfqq = io_group_async_queue_prio(iog, ioprio_class, + ioprio); + cfqq = async_cfqq; } if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, bio, is_sync, ioc, gfp_mask); + + if (!is_sync && !async_cfqq) + io_group_set_async_queue(iog, ioprio_class, ioprio, cfqq->ioq); +#ifdef CONFIG_TRACK_ASYNC_CONTEXT /* - * pin the queue now that it's allocated, scheduler exit will prune it + * ioc reference. If async request queue/group is determined from the + * original task/cgroup and not from submitter task, io context can + * not cache the pointer to async queue and everytime a request comes, + * it will be determined by going through the async queue array. + * + * This comes from the fact that we might be getting async requests + * which belong to a different cgroup altogether than the cgroup + * iocontext belongs to. And this thread might be submitting bios + * from various cgroups. So every time async queue will be different + * based on the cgroup of the bio/rq. Can't cache the async cfqq + * pointer in cic. 
*/ - if (!is_sync && !(*async_cfqq)) { - atomic_inc(&cfqq->ref); - *async_cfqq = cfqq; - } - - atomic_inc(&cfqq->ref); + if (is_sync) + elv_get_ioq(cfqq->ioq); +#else + /* + * async requests are being attributed to task submitting + * it, hence cic can cache async cfqq pointer. Take the + * queue reference even for async queue. + */ + elv_get_ioq(cfqq->ioq); +#endif return cfqq; } @@ -1834,6 +1618,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, { unsigned long flags; int ret; + struct request_queue *q = cfqd->queue; ret = radix_tree_preload(gfp_mask); if (!ret) { @@ -1850,9 +1635,9 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, radix_tree_preload_end(); if (!ret) { - spin_lock_irqsave(cfqd->queue->queue_lock, flags); + spin_lock_irqsave(q->queue_lock, flags); list_add(&cic->queue_list, &cfqd->cic_list); - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); + spin_unlock_irqrestore(q->queue_lock, flags); } } @@ -1872,10 +1657,11 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) { struct io_context *ioc = NULL; struct cfq_io_context *cic; + struct request_queue *q = cfqd->queue; might_sleep_if(gfp_mask & __GFP_WAIT); - ioc = get_io_context(gfp_mask, cfqd->queue->node); + ioc = get_io_context(gfp_mask, q->node); if (!ioc) return NULL; @@ -1894,7 +1680,10 @@ out: smp_read_barrier_depends(); if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); - +#ifdef CONFIG_IOSCHED_CFQ_HIER + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); @@ -1960,7 +1749,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) return; - enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); + enable_idle = old_idle = elv_ioq_idle_window(cfqq->ioq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (cfqd->hw_tag && CIC_SEEKY(cic))) @@ -1975,27 +1764,28 @@ 
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (old_idle != enable_idle) { cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); if (enable_idle) - cfq_mark_cfqq_idle_window(cfqq); + elv_mark_ioq_idle_window(cfqq->ioq); else - cfq_clear_cfqq_idle_window(cfqq); + elv_clear_ioq_idle_window(cfqq->ioq); } } /* * Check if new_cfqq should preempt the currently active queue. Return 0 for - * no or if we aren't sure, a 1 will cause a preempt. + * no or if we aren't sure, a 1 will cause a preemption attempt. + * Some of the preemption logic has been moved to common layer. Only cfq + * specific parts are left here. */ static int -cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, - struct request *rq) +cfq_should_preempt(struct request_queue *q, void *new_cfqq, struct request *rq) { - struct cfq_queue *cfqq; + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_queue *cfqq = elv_active_sched_queue(q->elevator); - cfqq = cfqd->active_queue; if (!cfqq) return 0; - if (cfq_slice_used(cfqq)) + if (elv_ioq_slice_used(cfqq->ioq)) return 1; if (cfq_class_idle(new_cfqq)) @@ -2018,13 +1808,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_meta(rq) && !cfqq->meta_pending) return 1; - /* - * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. - */ - if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) - return 1; - - if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) + if (!cfqd->active_cic || !elv_ioq_wait_request(cfqq->ioq)) return 0; /* @@ -2038,29 +1822,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, } /* - * cfqq preempts the active queue. if we allowed preempt with no slice left, - * let it have half of its nominal slice. 
- */ -static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - - /* - * Put the new queue at the front of the of the current list, - * so we know that it will be selected next. - */ - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - - cfq_service_tree_add(cfqd, cfqq, 1); - - cfqq->slice_end = 0; - cfq_mark_cfqq_slice_new(cfqq); -} - -/* * Called when a new fs request (rq) is added (to cfqq). Check if there's * something we should do about it + * After enqueuing the request whether queue should be preempted or kicked + * decision is taken by common layer. */ static void cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, @@ -2077,36 +1842,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_update_idle_window(cfqd, cfqq, cic); cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (cfqq == cfqd->active_queue) { - /* - * Remember that we saw a request from this process, but - * don't start queuing just yet. Otherwise we risk seeing lots - * of tiny requests, because we disrupt the normal plugging - * and merging. If the request is already larger than a single - * page, let it rip immediately. For that case we assume that - * merging is already done. Ditto for a busy system that - * has other work pending, don't risk delaying until the - * idle timer unplug to continue working. 
- */ - if (cfq_cfqq_wait_request(cfqq)) { - if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || - cfqd->busy_queues > 1) { - del_timer(&cfqd->idle_slice_timer); - __blk_run_queue(cfqd->queue); - } - cfq_mark_cfqq_must_dispatch(cfqq); - } - } else if (cfq_should_preempt(cfqd, cfqq, rq)) { - /* - * not the active queue - expire current slice if it is - * idle and has expired it's mean thinktime or this new queue - * has some old slice time left and is of higher priority or - * this new queue is RT and the current one is BE - */ - cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue); - } } static void cfq_insert_request(struct request_queue *q, struct request *rq) @@ -2130,11 +1865,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { - if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = cfqd->rq_in_driver; + struct elevator_queue *eq = cfqd->queue->elevator; + + if (elv_rq_in_driver(eq) > cfqd->rq_in_driver_peak) + cfqd->rq_in_driver_peak = elv_rq_in_driver(eq); if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) + elv_rq_in_driver(eq) <= CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -2161,44 +1898,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver); - WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver--; - cfqq->dispatched--; - if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight--; - if (sync) RQ_CIC(rq)->last_end_request = now; - - /* - * If this is the active queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. 
- */ - if (cfqd->active_queue == cfqq) { - const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); - - if (cfq_cfqq_slice_new(cfqq)) { - cfq_set_prio_slice(cfqd, cfqq); - cfq_clear_cfqq_slice_new(cfqq); - } - /* - * If there are no requests waiting in this queue, and - * there are other queues ready to issue requests, AND - * those other queues are issuing requests within our - * mean seek distance, give them a chance to run instead - * of idling. - */ - if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && - sync && !rq_noidle(rq)) - cfq_arm_slice_timer(cfqd); - } - - if (!cfqd->rq_in_driver) - cfq_schedule_dispatch(cfqd); } /* @@ -2207,30 +1910,33 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) */ static void cfq_prio_boost(struct cfq_queue *cfqq) { + struct io_queue *ioq = cfqq->ioq; + if (has_fs_excl()) { /* * boost idle prio on transactions that would lock out other * users of the filesystem */ if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; + elv_ioq_set_ioprio_class(ioq, IOPRIO_CLASS_BE); + if (elv_ioq_ioprio(ioq) > IOPRIO_NORM) + elv_ioq_set_ioprio(ioq, IOPRIO_NORM); + } else { /* * check if we need to unboost the queue */ - if (cfqq->ioprio_class != cfqq->org_ioprio_class) - cfqq->ioprio_class = cfqq->org_ioprio_class; - if (cfqq->ioprio != cfqq->org_ioprio) - cfqq->ioprio = cfqq->org_ioprio; + if (elv_ioq_ioprio_class(ioq) != cfqq->org_ioprio_class) + elv_ioq_set_ioprio_class(ioq, cfqq->org_ioprio_class); + if (elv_ioq_ioprio(ioq) != cfqq->org_ioprio) + elv_ioq_set_ioprio(ioq, cfqq->org_ioprio); } } static inline int __cfq_may_queue(struct cfq_queue *cfqq) { - if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && - !cfq_cfqq_must_alloc_slice(cfqq)) { + if ((elv_ioq_wait_request(cfqq->ioq) || + cfq_cfqq_must_alloc(cfqq)) && 
!cfq_cfqq_must_alloc_slice(cfqq)) { cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; } @@ -2282,7 +1988,7 @@ static void cfq_put_request(struct request *rq) put_io_context(RQ_CIC(rq)->ioc); rq->elevator_private = NULL; - rq->elevator_private2 = NULL; + rq->ioq = NULL; cfq_put_queue(cfqq); } @@ -2292,7 +1998,8 @@ static void cfq_put_request(struct request *rq) * Allocate cfq data structures associated with this request. */ static int -cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, + gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_context *cic; @@ -2312,125 +2019,37 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); + cfqq = cfq_get_queue(cfqd, bio, is_sync, cic->ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } cfqq->allocated[rw]++; cfq_clear_cfqq_must_alloc(cfqq); - atomic_inc(&cfqq->ref); + elv_get_ioq(cfqq->ioq); spin_unlock_irqrestore(q->queue_lock, flags); rq->elevator_private = cic; - rq->elevator_private2 = cfqq; + rq->ioq = cfqq->ioq; return 0; queue_fail: if (cic) put_io_context(cic->ioc); - cfq_schedule_dispatch(cfqd); + elv_schedule_dispatch(cfqd->queue); spin_unlock_irqrestore(q->queue_lock, flags); cfq_log(cfqd, "set_request fail"); return 1; } -static void cfq_kick_queue(struct work_struct *work) -{ - struct cfq_data *cfqd = - container_of(work, struct cfq_data, unplug_work); - struct request_queue *q = cfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue); - spin_unlock_irq(q->queue_lock); -} - -/* - * Timer running if the active_queue is currently idling inside its time slice - */ -static void cfq_idle_slice_timer(unsigned long data) -{ - struct cfq_data *cfqd = (struct cfq_data *) data; - struct cfq_queue 
*cfqq; - unsigned long flags; - int timed_out = 1; - - cfq_log(cfqd, "idle timer fired"); - - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - - cfqq = cfqd->active_queue; - if (cfqq) { - timed_out = 0; - - /* - * We saw a request before the queue expired, let it through - */ - if (cfq_cfqq_must_dispatch(cfqq)) - goto out_kick; - - /* - * expired - */ - if (cfq_slice_used(cfqq)) - goto expire; - - /* - * only expire and reinvoke request handler, if there are - * other queues with pending requests - */ - if (!cfqd->busy_queues) - goto out_cont; - - /* - * not expired and it has a request pending, let it dispatch - */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto out_kick; - } -expire: - cfq_slice_expired(cfqd, timed_out); -out_kick: - cfq_schedule_dispatch(cfqd); -out_cont: - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); -} - -static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) -{ - del_timer_sync(&cfqd->idle_slice_timer); - cancel_work_sync(&cfqd->unplug_work); -} - -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; - cfq_shutdown_timer_wq(cfqd); - spin_lock_irq(q->queue_lock); - if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - while (!list_empty(&cfqd->cic_list)) { struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, struct cfq_io_context, @@ -2439,16 +2058,11 @@ static void cfq_exit_queue(struct elevator_queue *e) __cfq_exit_single_io_context(cfqd, cic); } - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); - - cfq_shutdown_timer_wq(cfqd); - kfree(cfqd); } -static void 
*cfq_init_queue(struct request_queue *q) +static void *cfq_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct cfq_data *cfqd; int i; @@ -2457,8 +2071,6 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; - /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -2473,28 +2085,22 @@ static void *cfq_init_queue(struct request_queue *q) * will not attempt to free it. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - atomic_inc(&cfqd->oom_cfqq.ref); + + /* Link up oom_ioq and oom_cfqq */ + cfqd->oom_cfqq.ioq = elv_get_oom_ioq(eq); + elv_init_ioq_sched_queue(eq, elv_get_oom_ioq(eq), &cfqd->oom_cfqq); INIT_LIST_HEAD(&cfqd->cic_list); cfqd->queue = q; - init_timer(&cfqd->idle_slice_timer); - cfqd->idle_slice_timer.function = cfq_idle_slice_timer; - cfqd->idle_slice_timer.data = (unsigned long) cfqd; - - INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); - cfqd->cfq_quantum = cfq_quantum; cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; cfqd->cfq_back_max = cfq_back_max; cfqd->cfq_back_penalty = cfq_back_penalty; - cfqd->cfq_slice[0] = cfq_slice_async; - cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; - cfqd->hw_tag = 1; return cfqd; } @@ -2560,8 +2166,6 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); -SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); -SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); #undef SHOW_FUNCTION @@ -2590,8 +2194,6 @@ STORE_FUNCTION(cfq_back_seek_max_store, 
&cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); #undef STORE_FUNCTION @@ -2605,10 +2207,14 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(fifo_expire_async), CFQ_ATTR(back_seek_max), CFQ_ATTR(back_seek_penalty), - CFQ_ATTR(slice_sync), - CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + ELV_ATTR(slice_sync), + ELV_ATTR(slice_async), +#ifdef CONFIG_GROUP_IOSCHED + ELV_ATTR(group_idle), + ELV_ATTR(fairness), +#endif __ATTR_NULL }; @@ -2621,8 +2227,6 @@ static struct elevator_type iosched_cfq = { .elevator_dispatch_fn = cfq_dispatch_requests, .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, - .elevator_deactivate_req_fn = cfq_deactivate_request, - .elevator_queue_empty_fn = cfq_queue_empty, .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, @@ -2632,7 +2236,14 @@ static struct elevator_type iosched_cfq = { .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, .trim = cfq_free_io_context, + .elevator_free_sched_queue_fn = cfq_free_cfq_queue, + .elevator_active_ioq_set_fn = cfq_active_ioq_set, + .elevator_active_ioq_reset_fn = cfq_active_ioq_reset, + .elevator_arm_slice_timer_fn = cfq_arm_slice_timer, + .elevator_should_preempt_fn = cfq_should_preempt, + .elevator_close_cooperator_fn = cfq_close_cooperator, }, + .elevator_features = ELV_IOSCHED_NEED_FQ, .elevator_attrs = cfq_attrs, .elevator_name = "cfq", .elevator_owner = THIS_MODULE, @@ -2640,14 +2251,6 @@ static struct elevator_type iosched_cfq 
= { static int __init cfq_init(void) { - /* - * could be 0 on HZ < 1000 setups - */ - if (!cfq_slice_async) - cfq_slice_async = 1; - if (!cfq_slice_idle) - cfq_slice_idle = 1; - if (cfq_slab_setup()) return -ENOMEM; diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbc..b69c29f 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -23,25 +23,23 @@ static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ -struct deadline_data { - /* - * run time data - */ - +struct deadline_queue { /* * requests (deadline_rq s) are present on both sort_list and fifo_list */ - struct rb_root sort_list[2]; + struct rb_root sort_list[2]; struct list_head fifo_list[2]; - /* * next in sort order. read, write or both are NULL */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ - sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ +}; +struct deadline_data { + struct request_queue *q; + sector_t last_sector; /* head position */ /* * settings that change how the i/o scheduler behaves */ @@ -56,7 +54,9 @@ static void deadline_move_request(struct deadline_data *, struct request *); static inline struct rb_root * deadline_rb_root(struct deadline_data *dd, struct request *rq) { - return &dd->sort_list[rq_data_dir(rq)]; + struct deadline_queue *dq = elv_get_sched_queue(dd->q, rq); + + return &dq->sort_list[rq_data_dir(rq)]; } /* @@ -87,9 +87,10 @@ static inline void deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) { const int data_dir = rq_data_dir(rq); + struct deadline_queue *dq = elv_get_sched_queue(dd->q, rq); - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); + if (dq->next_rq[data_dir] == rq) + dq->next_rq[data_dir] = deadline_latter_request(rq); 
elv_rb_del(deadline_rb_root(dd, rq), rq); } @@ -102,6 +103,7 @@ deadline_add_request(struct request_queue *q, struct request *rq) { struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + struct deadline_queue *dq = elv_get_sched_queue(q, rq); deadline_add_rq_rb(dd, rq); @@ -109,7 +111,7 @@ deadline_add_request(struct request_queue *q, struct request *rq) * set expire time and add to fifo list */ rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + list_add_tail(&rq->queuelist, &dq->fifo_list[data_dir]); } /* @@ -129,6 +131,11 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) struct deadline_data *dd = q->elevator->elevator_data; struct request *__rq; int ret; + struct deadline_queue *dq; + + dq = elv_get_sched_queue_bio(q, bio); + if (!dq) + return ELEVATOR_NO_MERGE; /* * check for front merge @@ -136,7 +143,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) if (dd->front_merges) { sector_t sector = bio->bi_sector + bio_sectors(bio); - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + __rq = elv_rb_find(&dq->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); @@ -207,10 +214,11 @@ static void deadline_move_request(struct deadline_data *dd, struct request *rq) { const int data_dir = rq_data_dir(rq); + struct deadline_queue *dq = elv_get_sched_queue(dd->q, rq); - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; - dd->next_rq[data_dir] = deadline_latter_request(rq); + dq->next_rq[READ] = NULL; + dq->next_rq[WRITE] = NULL; + dq->next_rq[data_dir] = deadline_latter_request(rq); dd->last_sector = rq_end_sector(rq); @@ -225,9 +233,9 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. 
Requires !list_empty(&dd->fifo_list[data_dir]) */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +static inline int deadline_check_fifo(struct deadline_queue *dq, int ddir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + struct request *rq = rq_entry_fifo(dq->fifo_list[ddir].next); /* * rq is expired! @@ -245,20 +253,26 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) static int deadline_dispatch_requests(struct request_queue *q, int force) { struct deadline_data *dd = q->elevator->elevator_data; - const int reads = !list_empty(&dd->fifo_list[READ]); - const int writes = !list_empty(&dd->fifo_list[WRITE]); + struct deadline_queue *dq = elv_select_sched_queue(q, force); + int reads, writes; struct request *rq; int data_dir; + if (!dq) + return 0; + + reads = !list_empty(&dq->fifo_list[READ]); + writes = !list_empty(&dq->fifo_list[WRITE]); + /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; + if (dq->next_rq[WRITE]) + rq = dq->next_rq[WRITE]; else - rq = dd->next_rq[READ]; + rq = dq->next_rq[READ]; - if (rq && dd->batching < dd->fifo_batch) + if (rq && dq->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; @@ -268,9 +282,9 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) */ if (reads) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + BUG_ON(RB_EMPTY_ROOT(&dq->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (writes && (dq->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -284,9 +298,9 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) if (writes) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + BUG_ON(RB_EMPTY_ROOT(&dq->sort_list[WRITE])); - dd->starved = 0; + dq->starved = 0; data_dir = WRITE; @@ -299,55 +313,70 @@ dispatch_find_request: /* * we are not 
running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + if (deadline_check_fifo(dq, data_dir) || !dq->next_rq[data_dir]) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = rq_entry_fifo(dq->fifo_list[data_dir].next); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = dq->next_rq[data_dir]; } - dd->batching = 0; + dq->batching = 0; dispatch_request: /* * rq is the selected appropriate request. */ - dd->batching++; + dq->batching++; deadline_move_request(dd, rq); return 1; } -static int deadline_queue_empty(struct request_queue *q) +static void *deadline_alloc_deadline_queue(struct request_queue *q, + struct elevator_queue *eq, gfp_t gfp_mask, struct io_queue *ioq) { - struct deadline_data *dd = q->elevator->elevator_data; + struct deadline_queue *dq; - return list_empty(&dd->fifo_list[WRITE]) - && list_empty(&dd->fifo_list[READ]); + dq = kmalloc_node(sizeof(*dq), gfp_mask | __GFP_ZERO, q->node); + if (dq == NULL) + goto out; + + INIT_LIST_HEAD(&dq->fifo_list[READ]); + INIT_LIST_HEAD(&dq->fifo_list[WRITE]); + dq->sort_list[READ] = RB_ROOT; + dq->sort_list[WRITE] = RB_ROOT; +out: + return dq; +} + +static void deadline_free_deadline_queue(struct elevator_queue *e, + void *sched_queue) +{ + struct deadline_queue *dq = sched_queue; + + kfree(dq); } static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); - kfree(dd); } /* * initialize elevator private data (deadline_data). 
*/ -static void *deadline_init_queue(struct request_queue *q) +static void * +deadline_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct deadline_data *dd; @@ -355,10 +384,7 @@ static void *deadline_init_queue(struct request_queue *q) if (!dd) return NULL; - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; + dd->q = q; dd->fifo_expire[READ] = read_expire; dd->fifo_expire[WRITE] = write_expire; dd->writes_starved = writes_starved; @@ -435,6 +461,12 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(fifo_batch), +#ifdef CONFIG_IOSCHED_DEADLINE_HIER + ELV_ATTR(fairness), + ELV_ATTR(slice_sync), + ELV_ATTR(group_idle), + ELV_ATTR(map_sync), +#endif __ATTR_NULL }; @@ -445,13 +477,16 @@ static struct elevator_type iosched_deadline = { .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, .elevator_add_req_fn = deadline_add_request, - .elevator_queue_empty_fn = deadline_queue_empty, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, .elevator_init_fn = deadline_init_queue, .elevator_exit_fn = deadline_exit_queue, + .elevator_alloc_sched_queue_fn = deadline_alloc_deadline_queue, + .elevator_free_sched_queue_fn = deadline_free_deadline_queue, }, - +#ifdef CONFIG_IOSCHED_DEADLINE_HIER + .elevator_features = ELV_IOSCHED_NEED_FQ | ELV_IOSCHED_SINGLE_IOQ, +#endif .elevator_attrs = deadline_attrs, .elevator_name = "deadline", .elevator_owner = THIS_MODULE, diff --git a/block/elevator-fq.c b/block/elevator-fq.c new file mode 100644 index 0000000..66b10eb --- /dev/null +++ b/block/elevator-fq.c @@ -0,0 +1,4161 @@ +/* + * elevator fair queuing Layer. Uses B-WF2Q+ hierarchical scheduler for + * fair queuing. 
+ * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include +#include "elevator-fq.h" +#include +#include +#include +#include + +/* Values taken from cfq */ +const int elv_slice_sync = HZ / 10; +int elv_slice_async = HZ / 25; +const int elv_slice_async_rq = 2; +int elv_group_idle = HZ / 125; +static struct kmem_cache *elv_ioq_pool; + +#define ELV_SLICE_SCALE (5) +#define ELV_HW_QUEUE_MIN (5) + +#define IO_DEFAULT_GRP_WEIGHT 500 +#define IO_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +#define IO_SERVICE_TREE_INIT ((struct io_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +/* Mainly the BFQ scheduling code follows */ + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +static void +elv_release_ioq(struct elevator_queue *eq, struct io_queue **ioq_ptr); + +#ifdef CONFIG_GROUP_IOSCHED +#define for_each_entity(entity) \ + for (; entity != NULL; entity = entity->parent) + +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd, + int extract); + +static int bfq_update_next_active(struct io_sched_data *sd) +{ + struct io_group *iog; + struct io_entity *entity, *next_active; + + if (sd->active_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree.
By now we worry more about + * correctness than about performance... + */ + next_active = bfq_lookup_next_entity(sd, 0); + sd->next_active = next_active; + + if (next_active != NULL) { + iog = container_of(sd, struct io_group, sched_data); + entity = iog->my_entity; + if (entity != NULL) + entity->budget = next_active->budget; + } + + return 1; +} + +static inline void bfq_check_next_active(struct io_sched_data *sd, + struct io_entity *entity) +{ + BUG_ON(sd->next_active != entity); +} + +static inline int iog_deleting(struct io_group *iog) +{ + return iog->deleting; +} + +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct io_entity *entity, struct io_entity *new_entity) +{ + if (entity->sched_data == new_entity->sched_data) + return 1; + + return 0; +} + +static inline struct io_entity *parent_entity(struct io_entity *entity) +{ + return entity->parent; +} + +/* return depth at which a io entity is present in the hierarchy */ +static inline int depth_entity(struct io_entity *entity) +{ + int depth = 0; + + for_each_entity(entity) + depth++; + + return depth; +} + +static void bfq_find_matching_entity(struct io_entity **entity, + struct io_entity **new_entity) +{ + int entity_depth, new_entity_depth; + + /* + * preemption test can be made between sibling entities who are in the + * same group i.e who have a common parent. Walk up the hierarchy of + * both entities until we find their ancestors who are siblings of + * common parent. 
+ */ + + /* First walk up until both entities are at same depth */ + entity_depth = depth_entity(*entity); + new_entity_depth = depth_entity(*new_entity); + + while (entity_depth > new_entity_depth) { + entity_depth--; + *entity = parent_entity(*entity); + } + + while (new_entity_depth > entity_depth) { + new_entity_depth--; + *new_entity = parent_entity(*new_entity); + } + + while (!is_same_group(*entity, *new_entity)) { + *entity = parent_entity(*entity); + *new_entity = parent_entity(*new_entity); + } +} + +static inline struct io_group *io_entity_to_iog(struct io_entity *entity) +{ + struct io_group *iog = NULL; + + BUG_ON(entity == NULL); + if (entity->my_sched_data != NULL) + iog = container_of(entity, struct io_group, entity); + return iog; +} + +/* Returns parent group of io group */ +static inline struct io_group *iog_parent(struct io_group *iog) +{ + struct io_group *piog; + + if (!iog->entity.sched_data) + return NULL; + + /* + * Not following entity->parent pointer as for top level groups + * this pointer is NULL. + */ + piog = container_of(iog->entity.sched_data, struct io_group, + sched_data); + return piog; +} + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +static void io_group_path(struct io_group *iog, char *buf, int buflen) +{ + unsigned short id = iog->iocg_id; + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (!id) + goto out; + + css = css_lookup(&io_subsys, id); + if (!css) + goto out; + + if (!css_tryget(css)) + goto out; + + cgroup_path(css->cgroup, buf, buflen); + + css_put(css); + + rcu_read_unlock(); + return; +out: + rcu_read_unlock(); + buf[0] = '\0'; + return; +} + +/* + * An entity has been freshly added to active tree. Either it came from + * idle tree or it was not on any of the trees. Do the accounting. 
+ */ +static inline void bfq_account_for_entity_addition(struct io_entity *entity) +{ + struct io_group *iog = io_entity_to_iog(entity); + + if (iog) { + struct elv_fq_data *efqd; + + /* + * Keep track of how many times a group has been added + * to active tree. + */ + iog->queue++; + iog->queue_start = jiffies; + + /* Log group addition event */ + rcu_read_lock(); + efqd = rcu_dereference(iog->key); + if (efqd) + elv_log_iog(efqd, iog, "add group weight=%u", + iog->entity.weight); + rcu_read_unlock(); + } +} + +/* + * An entity got removed from active tree and either went to idle tree or + * is not on any of the trees. Do the accounting. + */ +static inline void bfq_account_for_entity_deletion(struct io_entity *entity) +{ + struct io_group *iog = io_entity_to_iog(entity); + + if (iog) { + struct elv_fq_data *efqd; + + iog->dequeue++; + /* Keep a track of how long group was on active tree */ + iog->queue_duration += jiffies_to_msecs(jiffies - + iog->queue_start); + iog->queue_start = 0; + + /* Log group deletion event */ + rcu_read_lock(); + efqd = rcu_dereference(iog->key); + if (efqd) + elv_log_iog(efqd, iog, "del group weight=%u", + iog->entity.weight); + rcu_read_unlock(); + } +} +#endif /* DEBUG_GROUP_IOSCHED */ +#else /* GROUP_IOSCHED */ +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + +static inline int bfq_update_next_active(struct io_sched_data *sd) +{ + return 0; +} + +static inline void bfq_check_next_active(struct io_sched_data *sd, + struct io_entity *entity) +{ +} + +static inline int iog_deleting(struct io_group *iog) +{ + /* In flat mode, root cgroup can't be deleted.
*/ + return 0; +} + +static void bfq_find_matching_entity(struct io_entity **entity, + struct io_entity **new_entity) +{ +} + +static inline int +is_same_group(struct io_entity *entity, struct io_entity *new_entity) +{ + return 1; +} + +static inline struct io_group *io_entity_to_iog(struct io_entity *entity) +{ + return NULL; +} +#endif /* GROUP_IOSCHED */ + +static inline int elv_prio_slice(struct elv_fq_data *efqd, int sync, + unsigned short prio) +{ + const int base_slice = efqd->elv_slice[sync]; + + WARN_ON(prio >= IOPRIO_BE_NR); + + return base_slice + (base_slice/ELV_SLICE_SCALE * (4 - prio)); +} + +static inline int +elv_prio_to_slice(struct elv_fq_data *efqd, struct io_queue *ioq) +{ + return elv_prio_slice(efqd, elv_ioq_sync(ioq), ioq->entity.ioprio); +} + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static inline int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor. + */ +static inline u64 bfq_delta(unsigned long service, unsigned int weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static inline void bfq_calc_finish(struct io_entity *entity, + unsigned long service) +{ + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + bfq_delta(service, entity->weight); +} + +static inline struct io_queue *io_entity_to_ioq(struct io_entity *entity) +{ + struct io_queue *ioq = NULL; + + BUG_ON(entity == NULL); + if (entity->my_sched_data == NULL) + ioq = container_of(entity, struct io_queue, entity); + return ioq; +} + +/** + * io_entity_of - get an entity from a node. + * @node: the node field of the entity. 
+ * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static inline struct io_entity *io_entity_of(struct rb_node *node) +{ + struct io_entity *entity = NULL; + + if (node != NULL) + entity = rb_entry(node, struct io_entity, rb_node); + + return entity; +} + +/** + * bfq_remove - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static inline void bfq_remove(struct rb_root *root, struct io_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_remove - remove an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_remove(struct io_service_tree *st, + struct io_entity *entity) +{ + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = io_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = io_entity_of(next); + } + + bfq_remove(&st->idle, entity); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. 
+ */ +static void bfq_insert(struct rb_root *root, struct io_entity *entity) +{ + struct io_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree != NULL); + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct io_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static inline void bfq_update_min(struct io_entity *entity, + struct rb_node *node) +{ + struct io_entity *child; + + if (node != NULL) { + child = rb_entry(node, struct io_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static inline void bfq_update_active_node(struct rb_node *node) +{ + struct io_entity *entity = rb_entry(node, struct io_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. 
This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (parent == NULL) + return; + + if (node == parent->rb_left && parent->rb_right != NULL) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left != NULL) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +/** + * bfq_active_insert - insert an entity in the active tree of its group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct io_service_tree *st, + struct io_entity *entity) +{ + struct rb_node *node = &entity->rb_node; + + bfq_insert(&st->active, entity); + entity->sched_data->nr_active++; + + if (node->rb_left != NULL) + node = node->rb_left; + else if (node->rb_right != NULL) + node = node->rb_right; + + bfq_update_active_tree(node); +} + +static void bfq_get_entity(struct io_entity *entity) +{ + struct io_queue *ioq = io_entity_to_ioq(entity); + + if (ioq) + elv_get_ioq(ioq); +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. 
If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (node->rb_right == NULL && node->rb_left == NULL) + deepest = rb_parent(node); + else if (node->rb_right == NULL) + deepest = node->rb_left; + else if (node->rb_left == NULL) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right != NULL) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_remove - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. + */ +static void bfq_active_remove(struct io_service_tree *st, + struct io_entity *entity) +{ + struct rb_node *node; + + node = bfq_find_deepest(&entity->rb_node); + bfq_remove(&st->active, entity); + entity->sched_data->nr_active--; + + if (node != NULL) + bfq_update_active_tree(node); +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct io_service_tree *st, + struct io_entity *entity) +{ + struct io_entity *first_idle = st->first_idle; + struct io_entity *last_idle = st->last_idle; + + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); +} + +/** + * bfq_forget_entity - remove an entity from the wfq trees. + * @st: the service tree. + * @entity: the entity being removed. + * + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. 
+ */ +static void bfq_forget_entity(struct io_service_tree *st, + struct io_entity *entity) +{ + struct io_queue *ioq = NULL; + + BUG_ON(!entity->on_st); + entity->on_st = 0; + st->wsum -= entity->weight; + ioq = io_entity_to_ioq(entity); + if (!ioq) + return; + elv_put_ioq(ioq); +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct io_service_tree *st, + struct io_entity *entity) +{ + bfq_idle_remove(st, entity); + bfq_forget_entity(st, entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct io_service_tree *st) +{ + struct io_entity *first_idle = st->first_idle; + struct io_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Active tree is empty. Pull back vtime to finish time of + * last idle entity on idle tree. + * The rationale seems to be that it reduces the possibility of + * vtime wraparound (bfq_gt(V-F) < 0). + */ + st->vtime = last_idle->finish; + } + + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +/* + * Returns the number of active entities a particular io group has. This + * includes number of active entities on service tree as well as the active + * entity which is being served currently, if any.
+ */ + +static inline int elv_iog_nr_active(struct io_group *iog) +{ + struct io_sched_data *sd = &iog->sched_data; + + if (sd->active_entity) + return sd->nr_active + 1; + else + return sd->nr_active; +} + +static struct io_service_tree * +__bfq_entity_update_prio(struct io_service_tree *old_st, + struct io_entity *entity) +{ + struct io_service_tree *new_st = old_st; + struct io_queue *ioq = io_entity_to_ioq(entity); + + if (entity->ioprio_changed) { + old_st->wsum -= entity->weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->weight = entity->new_weight; + entity->ioprio_changed = 0; + + /* + * Also update the scaled budget for ioq. Group will get the + * updated budget once ioq is selected to run next. + */ + if (ioq) { + struct elv_fq_data *efqd = ioq->efqd; + entity->budget = elv_prio_to_slice(efqd, ioq); + } + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = io_entity_service_tree(entity); + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated tasks getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. 
+ */ +static void bfq_update_vtime(struct io_service_tree *st) +{ + struct io_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct io_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active - find the eligible entity with the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start <= vtime) entity. The path + * on the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct io_entity *bfq_first_active_entity(struct io_service_tree *st) +{ + struct io_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node != NULL) { + entry = rb_entry(node, struct io_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, st->vtime)); + + if (node->rb_left != NULL) { + entry = rb_entry(node->rb_left, + struct io_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first != NULL) + break; + node = node->rb_right; + } + + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. 
+ */ +static struct io_entity *__bfq_lookup_next_entity(struct io_service_tree *st) +{ + struct io_entity *entity; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. + * + * NOTE: since we cache the next_active entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_active value; + * we prefer to do full lookups to test the consistency of * the data + * structures. + */ +static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd, + int extract) +{ + struct io_service_tree *st = sd->service_tree; + struct io_entity *entity; + int i; + + /* + * We should not call lookup when an entity is active, as doing lookup + * can result in an erroneous vtime jump. + */ + BUG_ON(sd->active_entity != NULL); + + for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) { + entity = __bfq_lookup_next_entity(st); + if (entity != NULL) { + if (extract) { + bfq_check_next_active(sd, entity); + bfq_active_remove(st, entity); + sd->active_entity = entity; + sd->next_active = NULL; + } + break; + } + } + + return entity; +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. 
+ */ +static void __bfq_activate_entity(struct io_entity *entity, int add_front) +{ + struct io_sched_data *sd = entity->sched_data; + struct io_service_tree *st = io_entity_service_tree(entity); + int newly_added = 0; + + if (entity == sd->active_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->active_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_active entity below it. We reuse the old + * start time. + */ + bfq_active_remove(st, entity); + } else if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_remove() will + * check for that. + */ + bfq_idle_remove(st, entity); + entity->start = bfq_gt(st->vtime, entity->finish) ? + st->vtime : entity->finish; + newly_added = 1; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = st->vtime; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + newly_added = 1; + } + + st = __bfq_entity_update_prio(st, entity); + /* + * This is to emulate cfq like functionality where preemption can + * happen with-in same class, like sync queue preempting async queue + * May be this is not a very good idea from fairness point of view + * as preempting queue gains share. Keeping it for now. + * + * This feature is also used by cfq close cooperator functionlity + * where cfq selects a queue out of order to run next based on + * close cooperator. 
+ */ + if (add_front) { + struct io_entity *next_entity; + + /* Determine the entity which will be dispatched next */ + next_entity = sd->next_active; + + if (next_entity && next_entity != entity) { + struct io_service_tree *new_st; + u64 delta; + + new_st = io_entity_service_tree(next_entity); + + /* + * At this point, both entities should belong to + * same service tree as cross service tree preemption + * is automatically taken care by algorithm + */ + BUG_ON(new_st != st); + entity->finish = next_entity->finish - 1; + delta = bfq_delta(entity->budget, entity->weight); + entity->start = entity->finish - delta; + if (bfq_gt(entity->start, st->vtime)) + entity->start = st->vtime; + } + } else { + bfq_calc_finish(entity, entity->budget); + } + bfq_active_insert(st, entity); + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + if (newly_added) + bfq_account_for_entity_addition(entity); +#endif +} + +/** + * bfq_activate_entity - activate an entity. + * @entity: the entity to activate. + */ +static void bfq_activate_entity(struct io_entity *entity, int add_front) +{ + struct io_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity, add_front); + + add_front = 0; + sd = entity->sched_data; + if (!bfq_update_next_active(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the active entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. 
+ * + */ +static int __bfq_deactivate_entity(struct io_entity *entity, int requeue) +{ + struct io_sched_data *sd = entity->sched_data; + struct io_service_tree *st = io_entity_service_tree(entity); + int was_active = entity == sd->active_entity; + int ret = 0, active_removed = 0; + + if (!entity->on_st) + return 0; + + BUG_ON(was_active && entity->tree != NULL); + + if (was_active) { + bfq_calc_finish(entity, entity->service); + sd->active_entity = NULL; + active_removed = 1; + } else if (entity->tree == &st->active) { + bfq_active_remove(st, entity); + active_removed = 1; + } else if (entity->tree == &st->idle) + bfq_idle_remove(st, entity); + else if (entity->tree != NULL) + BUG(); + if (was_active || sd->next_active == entity) + ret = bfq_update_next_active(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + + BUG_ON(sd->active_entity == entity); + BUG_ON(sd->next_active == entity); + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + if (active_removed) + bfq_account_for_entity_deletion(entity); +#endif + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. + * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct io_entity *entity, int requeue) +{ + struct io_sched_data *sd; + struct io_group *iog, *__iog; + struct io_entity *parent; + + iog = container_of(entity->sched_data, struct io_group, sched_data); + + /* + * Hold a reference to entity's iog until we are done. This function + * travels the hierarchy and we don't want to free up the group yet + * while we are traversing the hiearchy. It is possible that this + * group's cgroup has been removed hence cgroup reference is gone. + * If this entity was active entity, then its group will not be on + * any of the trees and it will be freed up the moment queue is + * freed up in __bfq_deactivate_entity(). 
+ * + * Hence, hold a reference, deactivate the hierarhcy of entities and + * then drop the reference which should free up the whole chain of + * groups. + */ + elv_get_iog(iog); + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * under service. + */ + break; + + if (sd->next_active != NULL) { + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + elv_put_iog(iog); + goto update; + } + + /* + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. + * + * If entity's group has been marked for deletion, don't + * requeue the group in idle tree so that it can be freed. + */ + __iog = container_of(entity->sched_data, struct io_group, + sched_data); + if (!iog_deleting(__iog)) + requeue = 1; + } + + elv_put_iog(iog); + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity, 0); + + sd = entity->sched_data; + if (!bfq_update_next_active(sd)) + break; + } +} + +void entity_served(struct io_entity *entity, unsigned long served, + unsigned long nr_sectors) +{ + struct io_service_tree *st; + + for_each_entity(entity) { + st = io_entity_service_tree(entity); + entity->service += served; + entity->total_service += served; + entity->total_sector_service += nr_sectors; + BUG_ON(st->wsum == 0); + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } +} + +/** + * io_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static void io_flush_idle_tree(struct io_service_tree *st) +{ + struct io_entity *entity = st->first_idle; + + for (; entity != NULL; entity = st->first_idle) + __bfq_deactivate_entity(entity, 0); +} + +/* + * Release all the io group references to its async queues. 
+ */ +static void +io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + elv_release_ioq(e, &iog->async_queue[i][j]); + + /* Free up async idle queue */ + elv_release_ioq(e, &iog->async_idle_queue); + +#ifdef CONFIG_GROUP_IOSCHED + /* Optimization for io schedulers having single ioq */ + if (elv_iosched_single_ioq(e)) + elv_release_ioq(e, &iog->ioq); +#endif +} + +/* Mainly hierarchical grouping code */ +#ifdef CONFIG_GROUP_IOSCHED +static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup); + +static inline void +bfq_init_entity(struct io_entity *entity, struct io_group *iog) +{ + entity->parent = iog->my_entity; + entity->sched_data = &iog->sched_data; +} + +static struct io_policy_node *policy_search_node(const struct io_cgroup *iocg, + dev_t dev); + +static void +io_group_init_entity(struct io_cgroup *iocg, struct io_group *iog, dev_t dev) +{ + struct io_entity *entity = &iog->entity; + struct io_policy_node *pn; + unsigned long flags; + + spin_lock_irqsave(&iocg->lock, flags); + pn = policy_search_node(iocg, dev); + if (pn) { + entity->weight = pn->weight; + entity->new_weight = pn->weight; + entity->ioprio_class = pn->ioprio_class; + entity->new_ioprio_class = pn->ioprio_class; + } else { + entity->weight = iocg->weight; + entity->new_weight = iocg->weight; + entity->ioprio_class = iocg->ioprio_class; + entity->new_ioprio_class = iocg->ioprio_class; + } + spin_unlock_irqrestore(&iocg->lock, flags); + entity->ioprio_changed = 1; + entity->my_sched_data = &iog->sched_data; +} + +static void io_group_set_parent(struct io_group *iog, struct io_group *parent) +{ + struct io_entity *entity; + + BUG_ON(parent == NULL); + BUG_ON(iog == NULL); + + entity = &iog->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; + if (entity->parent) + /* Child group reference on parent group. 
*/ + elv_get_iog(parent); +} + +struct io_cgroup io_root_cgroup = { + .weight = IO_DEFAULT_GRP_WEIGHT, + .ioprio_class = IO_DEFAULT_GRP_CLASS, +}; + +static struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup) +{ + if (!cgroup) + return &io_root_cgroup; + + return container_of(cgroup_subsys_state(cgroup, io_subsys_id), + struct io_cgroup, css); +} + +struct request_list *io_group_get_request_list(struct request_queue *q, + struct bio *bio) +{ + struct io_group *iog; + + iog = io_get_io_group_bio(q, bio, 1); + BUG_ON(!iog); + return &iog->rl; +} + +/* Set io group congestion on and off thresholds */ +void elv_io_group_congestion_threshold(struct request_queue *q, + struct io_group *iog) +{ + int nr; + + nr = q->nr_group_requests - (q->nr_group_requests / 8) + 1; + if (nr > q->nr_group_requests) + nr = q->nr_group_requests; + iog->nr_congestion_on = nr; + + nr = q->nr_group_requests - (q->nr_group_requests / 8) + - (q->nr_group_requests / 16) - 1; + if (nr < 1) + nr = 1; + iog->nr_congestion_off = nr; +} + +static inline int elv_is_iog_congested(struct request_queue *q, + struct io_group *iog, int sync) +{ + if (iog->rl.count[sync] >= iog->nr_congestion_on) + return 1; + return 0; +} + +/* Determine if io group page maps to is congested or not */ +int elv_io_group_congested(struct request_queue *q, struct page *page, int sync) +{ + struct io_group *iog; + int ret = 0; + + rcu_read_lock(); + + iog = io_get_io_group(q, page, 0); + + if (!iog) { + /* + * Either cgroup got deleted or this is first request in the + * group and associated io group object has not been created + * yet. Map it to root group. + * + * TODO: Fix the case of group not created yet. 
+ */ + iog = q->elevator->efqd.root_group; + } + + ret = elv_is_iog_congested(q, iog, sync); + if (ret) + elv_log_iog(&q->elevator->efqd, iog, "iog congested=%d sync=%d" + " rl.count[sync]=%d nr_group_requests=%d", + ret, sync, iog->rl.count[sync], q->nr_group_requests); + rcu_read_unlock(); + return ret; +} + +/* + * Search the io_group for efqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). + */ +static struct io_group * +io_cgroup_lookup_group(struct io_cgroup *iocg, void *key) +{ + struct io_group *iog; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + __key = rcu_dereference(iog->key); + if (__key == key) + return iog; + } + + return NULL; +} + +static int io_cgroup_policy_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct io_cgroup *iocg; + struct io_policy_node *pn; + + iocg = cgroup_to_io_cgroup(cgrp); + + if (list_empty(&iocg->policy_list)) + goto out; + + seq_printf(m, "dev\tweight\tclass\n"); + + spin_lock_irq(&iocg->lock); + list_for_each_entry(pn, &iocg->policy_list, node) { + seq_printf(m, "%u:%u\t%u\t%hu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight, pn->ioprio_class); + } + spin_unlock_irq(&iocg->lock); +out: + return 0; +} + +static inline void policy_insert_node(struct io_cgroup *iocg, + struct io_policy_node *pn) +{ + list_add(&pn->node, &iocg->policy_list); +} + +/* Must be called with iocg->lock held */ +static inline void policy_delete_node(struct io_policy_node *pn) +{ + list_del(&pn->node); +} + +/* Must be called with iocg->lock held */ +static struct io_policy_node *policy_search_node(const struct io_cgroup *iocg, + dev_t dev) +{ + struct io_policy_node *pn; + + if (list_empty(&iocg->policy_list)) + return NULL; + + list_for_each_entry(pn, &iocg->policy_list, node) { + if (pn->dev == dev) + return pn; + } + + return NULL; +} + +static int check_dev_num(dev_t dev) +{ + int part = 0; + struct gendisk 
*disk; + + disk = get_gendisk(dev, &part); + if (!disk || part) + return -ENODEV; + + return 0; +} + +static int policy_parse_and_set(char *buf, struct io_policy_node *newpn) +{ + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + int ret; + unsigned long major, minor, temp; + int i = 0; + dev_t dev; + + memset(s, 0, sizeof(s)); + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; + s[i++] = p; + + /* Prevent from inputing too many things */ + if (i == 4) + break; + } + + if (i != 3) + return -EINVAL; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + return -EINVAL; + + minor_s = s[0]; + if (!minor_s) + return -EINVAL; + + ret = strict_strtoul(major_s, 10, &major); + if (ret) + return -EINVAL; + + ret = strict_strtoul(minor_s, 10, &minor); + if (ret) + return -EINVAL; + + dev = MKDEV(major, minor); + + ret = check_dev_num(dev); + if (ret) + return ret; + + newpn->dev = dev; + + if (s[1] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[1], 10, &temp); + if (ret || temp > WEIGHT_MAX) + return -EINVAL; + + newpn->weight = temp; + + if (s[2] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[2], 10, &temp); + if (ret || temp < IOPRIO_CLASS_RT || temp > IOPRIO_CLASS_IDLE) + return -EINVAL; + newpn->ioprio_class = temp; + + return 0; +} + +static int io_cgroup_policy_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct io_cgroup *iocg; + struct io_policy_node *newpn, *pn; + char *buf; + int ret = 0; + int keep_newpn = 0; + struct hlist_node *n; + struct io_group *iog; + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } + + ret = policy_parse_and_set(buf, newpn); + if (ret) + goto free_newpn; + + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto free_newpn; + } + + iocg = cgroup_to_io_cgroup(cgrp); + spin_lock_irq(&iocg->lock); + + pn = policy_search_node(iocg, newpn->dev); 
+ if (!pn) { + if (newpn->weight != 0) { + policy_insert_node(iocg, newpn); + keep_newpn = 1; + } + goto update_io_group; + } + + if (newpn->weight == 0) { + /* weight == 0 means deleteing a policy */ + policy_delete_node(pn); + goto update_io_group; + } + + pn->weight = newpn->weight; + pn->ioprio_class = newpn->ioprio_class; + +update_io_group: + hlist_for_each_entry(iog, n, &iocg->group_data, group_node) { + if (iog->dev == newpn->dev) { + if (newpn->weight) { + iog->entity.new_weight = newpn->weight; + iog->entity.new_ioprio_class = + newpn->ioprio_class; + /* + * iog weight and ioprio_class updating + * actually happens if ioprio_changed is set. + * So ensure ioprio_changed is not set until + * new weight and new ioprio_class are updated. + */ + smp_wmb(); + iog->entity.ioprio_changed = 1; + } else { + iog->entity.new_weight = iocg->weight; + iog->entity.new_ioprio_class = + iocg->ioprio_class; + + /* The same as above */ + smp_wmb(); + iog->entity.ioprio_changed = 1; + } + } + } + spin_unlock_irq(&iocg->lock); + + cgroup_unlock(); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct io_cgroup *iocg; \ + u64 ret; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + iocg = cgroup_to_io_cgroup(cgroup); \ + spin_lock_irq(&iocg->lock); \ + ret = iocg->__VAR; \ + spin_unlock_irq(&iocg->lock); \ + \ + cgroup_unlock(); \ + \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int io_cgroup_##__VAR##_write(struct cgroup *cgroup, \ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct io_cgroup *iocg; \ + struct io_group *iog; \ + struct hlist_node *n; \ + struct io_policy_node *pn; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return -EINVAL; \ + \ + if 
(!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + iocg = cgroup_to_io_cgroup(cgroup); \ + \ + spin_lock_irq(&iocg->lock); \ + iocg->__VAR = (unsigned long)val; \ + hlist_for_each_entry(iog, n, &iocg->group_data, group_node) { \ + pn = policy_search_node(iocg, iog->dev); \ + if (pn) \ + continue; \ + iog->entity.new_##__VAR = (unsigned long)val; \ + smp_wmb(); \ + iog->entity.ioprio_changed = 1; \ + } \ + spin_unlock_irq(&iocg->lock); \ + \ + cgroup_unlock(); \ + \ + return 0; \ +} + +STORE_FUNCTION(weight, 1, WEIGHT_MAX); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION + +static int io_cgroup_disk_time_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg; + struct io_group *iog; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + + rcu_read_lock(); + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. + */ + if (iog->key) { + seq_printf(m, "%u:%u %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), + iog->entity.total_service); + } + } + rcu_read_unlock(); + cgroup_unlock(); + + return 0; +} + +static int io_cgroup_disk_sectors_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg; + struct io_group *iog; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + + rcu_read_lock(); + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. 
+ */ + if (iog->key) { + seq_printf(m, "%u:%u %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), + iog->entity.total_sector_service); + } + } + rcu_read_unlock(); + cgroup_unlock(); + + return 0; +} + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +static int io_cgroup_disk_queue_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg = NULL; + struct io_group *iog = NULL; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + rcu_read_lock(); + /* Loop through all the io groups and print statistics */ + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. + */ + if (iog->key) { + seq_printf(m, "%u:%u %lu %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), iog->queue, + iog->queue_duration); + } + } + rcu_read_unlock(); + cgroup_unlock(); + + return 0; +} + +static int io_cgroup_disk_dequeue_read(struct cgroup *cgroup, + struct cftype *cftype, struct seq_file *m) +{ + struct io_cgroup *iocg = NULL; + struct io_group *iog = NULL; + struct hlist_node *n; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + iocg = cgroup_to_io_cgroup(cgroup); + spin_lock_irq(&iocg->lock); + /* Loop through all the io groups and print statistics */ + hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) { + /* + * There might be groups which are not functional and + * waiting to be reclaimed upon cgoup deletion. 
+ */ + if (iog->key) { + seq_printf(m, "%u:%u %lu\n", MAJOR(iog->dev), + MINOR(iog->dev), iog->dequeue); + } + } + spin_unlock_irq(&iocg->lock); + cgroup_unlock(); + + return 0; +} +#endif + +struct cftype bfqio_files[] = { + { + .name = "policy", + .read_seq_string = io_cgroup_policy_read, + .write_string = io_cgroup_policy_write, + .max_write_len = 256, + }, + { + .name = "weight", + .read_u64 = io_cgroup_weight_read, + .write_u64 = io_cgroup_weight_write, + }, + { + .name = "ioprio_class", + .read_u64 = io_cgroup_ioprio_class_read, + .write_u64 = io_cgroup_ioprio_class_write, + }, + { + .name = "disk_time", + .read_seq_string = io_cgroup_disk_time_read, + }, + { + .name = "disk_sectors", + .read_seq_string = io_cgroup_disk_sectors_read, + }, +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + { + .name = "disk_queue", + .read_seq_string = io_cgroup_disk_queue_read, + }, + { + .name = "disk_dequeue", + .read_seq_string = io_cgroup_disk_dequeue_read, + }, +#endif +}; + +static int iocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, bfqio_files, + ARRAY_SIZE(bfqio_files)); +} + +static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys, + struct cgroup *cgroup) +{ + struct io_cgroup *iocg; + + if (cgroup->parent != NULL) { + iocg = kzalloc(sizeof(*iocg), GFP_KERNEL); + if (iocg == NULL) + return ERR_PTR(-ENOMEM); + } else + iocg = &io_root_cgroup; + + spin_lock_init(&iocg->lock); + INIT_HLIST_HEAD(&iocg->group_data); + iocg->weight = IO_DEFAULT_GRP_WEIGHT; + iocg->ioprio_class = IO_DEFAULT_GRP_CLASS; + INIT_LIST_HEAD(&iocg->policy_list); + + return &iocg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic/bfqq data structures. 
By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int iocg_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct task_struct *tsk) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too young or + * exiting: if it has still no ioc the ioc can't be shared, + * if the task is exiting the attach will fail anyway, no + * matter what we return here. + */ + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void iocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc != NULL) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys io_subsys = { + .name = "io", + .create = iocg_create, + .can_attach = iocg_can_attach, + .attach = iocg_attach, + .destroy = iocg_destroy, + .populate = iocg_populate, + .subsys_id = io_subsys_id, + .use_id = 1, +}; + +static inline unsigned int iog_weight(struct io_group *iog) +{ + return iog->entity.weight; +} + +/** + * io_group_chain_alloc - allocate a chain of groups. + * @efqd: queue descriptor. + * @cgroup: the leaf cgroup this chain starts from. + * + * Allocate a chain of groups starting from the one belonging to + * @cgroup up to the root cgroup. Stop if a cgroup on the chain + * to the root has already an allocated group on @efqd. 
+ */ +static struct io_group * +io_group_chain_alloc(struct request_queue *q, void *key, struct cgroup *cgroup) +{ + struct io_cgroup *iocg; + struct io_group *iog, *leaf = NULL, *prev = NULL; + gfp_t flags = GFP_ATOMIC | __GFP_ZERO; + unsigned int major, minor; + struct backing_dev_info *bdi = &q->backing_dev_info; + + for (; cgroup != NULL; cgroup = cgroup->parent) { + iocg = cgroup_to_io_cgroup(cgroup); + + iog = io_cgroup_lookup_group(iocg, key); + if (iog != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a io_group for efqd, so we don't + * need any more allocations. + */ + break; + } + + iog = kzalloc_node(sizeof(*iog), flags, q->node); + if (!iog) + goto cleanup; + + iog->iocg_id = css_id(&iocg->css); + + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + iog->dev = MKDEV(major, minor); + + io_group_init_entity(iocg, iog, iog->dev); + iog->my_entity = &iog->entity; + + atomic_set(&iog->ref, 0); + iog->deleting = 0; + + elv_mark_iog_idle_window(iog); + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. + */ + elv_get_iog(iog); + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + io_group_path(iog, iog->path, sizeof(iog->path)); +#endif + + + blk_init_request_list(&iog->rl); + elv_io_group_congestion_threshold(q, iog); + + if (leaf == NULL) { + leaf = iog; + prev = leaf; + } else { + io_group_set_parent(prev, iog); + /* + * Build a list of allocated nodes using the efqd + * filed, that is still unused and will be initialized + * only after the node will be connected. + */ + prev->key = iog; + prev = iog; + } + } + + return leaf; + +cleanup: + while (leaf != NULL) { + prev = leaf; + leaf = leaf->key; + kfree(prev); + } + + return NULL; +} + +/** + * io_group_chain_link - link an allocatd group chain to a cgroup hierarchy. 
+ * @efqd: the queue descriptor. + * @cgroup: the leaf cgroup to start from. + * @leaf: the leaf group (to be associated to @cgroup). + * + * Try to link a chain of groups to a cgroup hierarchy, connecting the + * nodes bottom-up, so we can be sure that when we find a cgroup in the + * hierarchy that already as a group associated to @efqd all the nodes + * in the path to the root cgroup have one too. + * + * On locking: the queue lock protects the hierarchy (there is a hierarchy + * per device) while the io_cgroup lock protects the list of groups + * belonging to the same cgroup. + */ +static void io_group_chain_link(struct request_queue *q, void *key, + struct cgroup *cgroup, + struct io_group *leaf, + struct elv_fq_data *efqd) +{ + struct io_cgroup *iocg; + struct io_group *iog, *next, *prev = NULL; + unsigned long flags; + + assert_spin_locked(q->queue_lock); + + for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { + iocg = cgroup_to_io_cgroup(cgroup); + next = leaf->key; + + iog = io_cgroup_lookup_group(iocg, key); + BUG_ON(iog != NULL); + + spin_lock_irqsave(&iocg->lock, flags); + + rcu_assign_pointer(leaf->key, key); + hlist_add_head_rcu(&leaf->group_node, &iocg->group_data); + hlist_add_head(&leaf->elv_data_node, &efqd->group_list); + + spin_unlock_irqrestore(&iocg->lock, flags); + + prev = leaf; + leaf = next; + } + + BUG_ON(cgroup == NULL && leaf != NULL); + + if (cgroup != NULL && prev != NULL) { + iocg = cgroup_to_io_cgroup(cgroup); + iog = io_cgroup_lookup_group(iocg, key); + io_group_set_parent(prev, iog); + } +} + +/** + * io_find_alloc_group - return the group associated to @efqd in @cgroup. + * @fqd: queue descriptor. + * @cgroup: cgroup being searched for. + * @create: if set to 1, create the io group if it has not been created yet. + * + * Return a group associated to @fqd in @cgroup, allocating one if + * necessary. When a group is returned all the cgroups in the path + * to the root have a group associated to @efqd. 
+ * + * If the allocation fails, return the root group: this breaks guarantees + * but is a safe fallbak. If this loss becames a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. + * + * We allocate all the missing nodes in the path from the leaf cgroup + * to the root and we connect the nodes only after all the allocations + * have been successful. + */ +static struct io_group *io_find_alloc_group(struct request_queue *q, + struct cgroup *cgroup, struct elv_fq_data *efqd, + int create) +{ + struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup); + struct io_group *iog = NULL; + /* Note: Use efqd as key */ + void *key = efqd; + + /* + * Take a refenrece to css object. Don't want to map a bio to + * a group if it has been marked for deletion + */ + + if (!css_tryget(&iocg->css)) + return iog; + + iog = io_cgroup_lookup_group(iocg, key); + if (iog != NULL || !create) + goto end; + + iog = io_group_chain_alloc(q, key, cgroup); + if (iog != NULL) + io_group_chain_link(q, key, cgroup, iog, efqd); + +end: + css_put(&iocg->css); + return iog; +} + +/* Map a page to respective cgroup. Null return means, map it to root cgroup */ +static inline struct cgroup *get_cgroup_from_page(struct page *page) +{ + unsigned long bio_cgroup_id; + struct cgroup *cgroup; + + bio_cgroup_id = get_blkio_cgroup_id_page(page); + + if (!bio_cgroup_id) + return NULL; + + cgroup = blkio_cgroup_lookup(bio_cgroup_id); + return cgroup; +} + +struct io_group *io_get_io_group_bio(struct request_queue *q, struct bio *bio, + int create) +{ + struct page *page = NULL; + struct elv_fq_data *efqd = &q->elevator->efqd; + + /* + * Determine the group from task context. Even calls from + * blk_get_request() which don't have any bio info will be mapped + * to the task's group + */ + if (!bio) + goto sync; + + if (bio_barrier(bio)) { + /* + * Map barrier requests to root group. 
May be more special + * bio cases should come here + */ + return q->elevator->efqd.root_group; + } + + /* Map the sync bio to the right group using task context */ + if (elv_bio_sync(bio) && !efqd->map_sync) + goto sync; + +#ifdef CONFIG_TRACK_ASYNC_CONTEXT + /* Determine the group from info stored in page */ + page = bio_iovec_idx(bio, 0)->bv_page; + return io_get_io_group(q, page, create); +#endif + +sync: + return io_get_io_group(q, NULL, create); +} +EXPORT_SYMBOL(io_get_io_group_bio); + +/* + * Find the io group page belongs to. + * If "create" is set, io group is created if it is not already present. + * + * Note: This function should be called with queue lock held. It returns + * a pointer to io group without taking any reference. That group will + * be around as long as queue lock is not dropped (as group reclaim code + * needs to get hold of queue lock). So if somebody needs to use group + * pointer even after dropping queue lock, take a reference to the group + * before dropping queue lock. + * + * One can call it without queue lock with rcu read lock held for browsing + * through the groups. 
+ */ +struct io_group *io_get_io_group(struct request_queue *q, struct page *page, + int create) +{ + struct cgroup *cgroup; + struct io_group *iog; + struct elv_fq_data *efqd = &q->elevator->efqd; + + if (create) + assert_spin_locked(q->queue_lock); + + rcu_read_lock(); + + if (!page) + cgroup = task_cgroup(current, io_subsys_id); + else + cgroup = get_cgroup_from_page(page); + + if (!cgroup) { + if (create) + iog = efqd->root_group; + else { + /* + * bio merge functions doing lookup don't want to + * map bio to root group by default + */ + iog = NULL; + } + goto out; + } + + iog = io_find_alloc_group(q, cgroup, efqd, create); + if (!iog) { + if (create) + iog = efqd->root_group; + else + iog = NULL; + } +out: + rcu_read_unlock(); + return iog; +} +EXPORT_SYMBOL(io_get_io_group); + +static void io_free_root_group(struct elevator_queue *e) +{ + struct io_cgroup *iocg = &io_root_cgroup; + struct elv_fq_data *efqd = &e->efqd; + struct io_group *iog = efqd->root_group; + struct io_service_tree *st; + int i; + + BUG_ON(!iog); + spin_lock_irq(&iocg->lock); + hlist_del_rcu(&iog->group_node); + spin_unlock_irq(&iocg->lock); + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) { + st = iog->sched_data.service_tree + i; + io_flush_idle_tree(st); + } + + io_put_io_group_queues(e, iog); + elv_put_iog(iog); +} + +static struct io_group *io_alloc_root_group(struct request_queue *q, + struct elevator_queue *e, void *key) +{ + struct io_group *iog; + struct io_cgroup *iocg; + int i; + + iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node); + if (iog == NULL) + return NULL; + + elv_get_iog(iog); + iog->entity.parent = NULL; + for (i = 0; i < IO_IOPRIO_CLASSES; i++) + iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT; + + blk_init_request_list(&iog->rl); + elv_io_group_congestion_threshold(q, iog); + + iocg = &io_root_cgroup; + spin_lock_irq(&iocg->lock); + rcu_assign_pointer(iog->key, key); + hlist_add_head_rcu(&iog->group_node, &iocg->group_data); + iog->iocg_id = 
css_id(&iocg->css); + spin_unlock_irq(&iocg->lock); + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + io_group_path(iog, iog->path, sizeof(iog->path)); +#endif + + return iog; +} + +static void io_group_free_rcu(struct rcu_head *head) +{ + struct io_group *iog; + + iog = container_of(head, struct io_group, rcu_head); + kfree(iog); +} + +/* + * This cleanup function does the last bit of things to destroy cgroup. + * It should only get called after io_destroy_group has been invoked. + */ +static void io_group_cleanup(struct io_group *iog) +{ + struct io_service_tree *st; + struct io_entity *entity = iog->my_entity; + int i; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) { + st = iog->sched_data.service_tree + i; + + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(!RB_EMPTY_ROOT(&st->idle)); + BUG_ON(st->wsum != 0); + } + + BUG_ON(iog->sched_data.next_active != NULL); + BUG_ON(iog->sched_data.active_entity != NULL); + BUG_ON(entity != NULL && entity->tree != NULL); + + /* + * Wait for any rcu readers to exit before freeing up the group. + * Primarily useful when io_get_io_group() is called without queue + * lock to access some group data from bdi_congested_group() path. + */ + call_rcu(&iog->rcu_head, io_group_free_rcu); +} + +void elv_put_iog(struct io_group *iog) +{ + struct io_group *parent = NULL; + struct io_entity *entity; + + BUG_ON(!iog); + + entity = iog->my_entity; + + BUG_ON(atomic_read(&iog->ref) <= 0); + if (!atomic_dec_and_test(&iog->ref)) + return; + + if (entity) + parent = container_of(iog->my_entity->parent, + struct io_group, entity); + + io_group_cleanup(iog); + + if (parent) + elv_put_iog(parent); +} +EXPORT_SYMBOL(elv_put_iog); + +/* + * check whether a given group has got any active entities on any of the + * service tree. 
+ */ +static inline int io_group_has_active_entities(struct io_group *iog) +{ + int i; + struct io_service_tree *st; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) { + st = iog->sched_data.service_tree + i; + if (!RB_EMPTY_ROOT(&st->active)) + return 1; + } + + /* + * Also check there are no active entities being served which are + * not on active tree + */ + + if (iog->sched_data.active_entity) + return 1; + + return 0; +} + +/* + * After the group is destroyed, no new sync IO should come to the group. + * It might still have pending IOs in some busy queues. It should be able to + * send those IOs down to the disk. The async IOs (due to dirty page writeback) + * would go in the root group queues after this, as the group does not exist + * anymore. + */ +static void __io_destroy_group(struct elv_fq_data *efqd, struct io_group *iog) +{ + struct elevator_queue *eq; + struct io_service_tree *st; + int i; + + BUG_ON(iog->my_entity == NULL); + + /* + * Mark io group for deletion so that no new entry goes in + * idle tree. Any active queue will be removed from active + * tree and not put in to idle tree. + */ + iog->deleting = 1; + + /* We flush idle tree now, and don't put things in there any more. */ + for (i = 0; i < IO_IOPRIO_CLASSES; i++) { + st = iog->sched_data.service_tree + i; + + io_flush_idle_tree(st); + } + + eq = container_of(efqd, struct elevator_queue, efqd); + hlist_del(&iog->elv_data_node); + io_put_io_group_queues(eq, iog); + + /* + * We can come here either through cgroup deletion path or through + * elevator exit path. If we come here through cgroup deletion path + * check if io group has any active entities or not. If not, then + * deactivate this io group to make sure it is removed from idle + * tree it might have been on. If this group was on idle tree, then + * this probably will be the last reference and group will be + * freed upon putting the reference down. 
+ */ + + if (!io_group_has_active_entities(iog)) { + /* + * io group does not have any active entites. Because this + * group has been decoupled from io_cgroup list and this + * cgroup is being deleted, this group should not receive + * any new IO. Hence it should be safe to deactivate this + * io group and remove from the scheduling tree. + */ + __bfq_deactivate_entity(iog->my_entity, 0); + } + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, cgroup can be destroyed. + */ + elv_put_iog(iog); +} + +static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup); + struct io_group *iog; + struct elv_fq_data *efqd; + unsigned long uninitialized_var(flags); + struct io_policy_node *pn, *pntmp; + + /* + * io groups are linked in two lists. One list is maintained + * in elevator (efqd->group_list) and other is maintained + * per cgroup structure (iocg->group_data). + * + * While a cgroup is being deleted, elevator also might be + * exiting and both might try to cleanup the same io group + * so need to be little careful. + * + * (iocg->group_data) is protected by iocg->lock. To avoid deadlock, + * we can't hold the queue lock while holding iocg->lock. So we first + * remove iog from iocg->group_data under iocg->lock. Whoever removes + * iog from iocg->group_data should call __io_destroy_group to remove + * iog. 
+ */ + + rcu_read_lock(); + +remove_entry: + spin_lock_irqsave(&iocg->lock, flags); + + if (hlist_empty(&iocg->group_data)) { + spin_unlock_irqrestore(&iocg->lock, flags); + goto done; + } + iog = hlist_entry(iocg->group_data.first, struct io_group, + group_node); + efqd = rcu_dereference(iog->key); + hlist_del_rcu(&iog->group_node); + iog->iocg_id = 0; + spin_unlock_irqrestore(&iocg->lock, flags); + + spin_lock_irqsave(efqd->queue->queue_lock, flags); + __io_destroy_group(efqd, iog); + spin_unlock_irqrestore(efqd->queue->queue_lock, flags); + goto remove_entry; + +done: + list_for_each_entry_safe(pn, pntmp, &iocg->policy_list, node) { + policy_delete_node(pn); + kfree(pn); + } + + free_css_id(&io_subsys, &iocg->css); + rcu_read_unlock(); + BUG_ON(!hlist_empty(&iocg->group_data)); + kfree(iocg); +} + +/* + * This functions checks if iog is still in iocg->group_data, and removes it. + * If iog is not in that list, then cgroup destroy path has removed it, and + * we do not need to remove it. 
+ */ +static void io_group_check_and_destroy(struct elv_fq_data *efqd, + struct io_group *iog) +{ + struct io_cgroup *iocg; + unsigned long flags; + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + css = css_lookup(&io_subsys, iog->iocg_id); + + /* No css was found for iog->iocg_id: nothing to unlink from here. */ + if (!css) + goto out; + + iocg = container_of(css, struct io_cgroup, css); + + spin_lock_irqsave(&iocg->lock, flags); + + /* + * iocg_destroy() zeroes iog->iocg_id when it unlinks the group, so a + * non-zero id seen under iocg->lock means the group is still linked + * and it is this path that has to tear it down. + * + * NOTE(review): __io_destroy_group() runs with iocg->lock held here, + * while iocg_destroy() drops iocg->lock before calling it -- confirm + * that this lock nesting is intended. + */ + if (iog->iocg_id) { + hlist_del_rcu(&iog->group_node); + __io_destroy_group(efqd, iog); + } + + spin_unlock_irqrestore(&iocg->lock, flags); +out: + rcu_read_unlock(); +} + +/* + * Walk efqd->group_list and run io_group_check_and_destroy() on every io + * group linked there. The _safe iterator is used because entries may be + * unlinked while walking. + */ +static void io_disconnect_groups(struct elevator_queue *e) +{ + struct hlist_node *pos, *n; + struct io_group *iog; + struct elv_fq_data *efqd = &e->efqd; + + hlist_for_each_entry_safe(iog, pos, n, &efqd->group_list, + elv_data_node) { + io_group_check_and_destroy(efqd, iog); + } +} + +/* + * If the bio submitting task and rq don't belong to the same io_group, the + * bio can't be merged into rq. + * + * Returns 1 when the merge is allowed and 0 otherwise. Always returns 1 + * when fair queuing is not enabled on the elevator. + */ +int io_group_allow_merge(struct request *rq, struct bio *bio) +{ + struct request_queue *q = rq->q; + struct io_queue *ioq = rq->ioq; + struct io_group *iog, *__iog; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return 1; + + /* Determine the io group of the bio submitting task */ + iog = io_get_io_group_bio(q, bio, 0); + if (!iog) { + /* May be task belongs to a different cgroup for which io + * group has not been setup yet. */ + return 0; + } + + /* Determine the io group of the ioq, rq belongs to */ + __iog = ioq_to_io_group(ioq); + + return (iog == __iog); +} + +/* + * Find/Create the io queue the rq should go in. This is an optimization + * for the io schedulers (noop, deadline and AS) which maintain only single + * io queue per cgroup. In this case common layer can just maintain a + * pointer in group data structure and keeps track of it. + * + * For the io schdulers like cfq, which maintain multiple io queues per + * cgroup, and decide the io queue of request based on process, this + * function is not invoked. 
+ */ +int elv_fq_set_request_ioq(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) +{ + struct elevator_queue *e = q->elevator; + unsigned long flags; + struct io_queue *ioq = NULL, *new_ioq = NULL; + struct io_group *iog; + void *sched_q = NULL, *new_sched_q = NULL; + + if (!elv_iosched_fair_queuing_enabled(e)) + return 0; + + might_sleep_if(gfp_mask & __GFP_WAIT); + spin_lock_irqsave(q->queue_lock, flags); + +retry: + /* Determine the io group request belongs to */ + iog = io_get_io_group_bio(q, bio, 1); + BUG_ON(!iog); + + /* Get the iosched queue */ + ioq = iog->ioq; + if (!ioq) { + /* io queue and sched_queue needs to be allocated */ + BUG_ON(!e->ops->elevator_alloc_sched_queue_fn); + + if (new_ioq) { + goto alloc_sched_q; + } else if (gfp_mask & __GFP_WAIT) { + /* + * Inform the allocator of the fact that we will + * just repeat this allocation if it fails, to allow + * the allocator to do whatever it needs to attempt to + * free memory. + */ + spin_unlock_irq(q->queue_lock); + new_ioq = elv_alloc_ioq(q, gfp_mask | __GFP_NOFAIL + | __GFP_ZERO); + spin_lock_irq(q->queue_lock); + goto retry; + } else { + ioq = elv_alloc_ioq(q, gfp_mask | __GFP_ZERO); + if (!ioq) + goto queue_fail; + } + +alloc_sched_q: + if (new_sched_q) { + ioq = new_ioq; + new_ioq = NULL; + sched_q = new_sched_q; + new_sched_q = NULL; + } else if (gfp_mask & __GFP_WAIT) { + /* + * Inform the allocator of the fact that we will + * just repeat this allocation if it fails, to allow + * the allocator to do whatever it needs to attempt to + * free memory. 
+ */ + spin_unlock_irq(q->queue_lock); + /* Call io scheduer to create scheduler queue */ + new_sched_q = e->ops->elevator_alloc_sched_queue_fn(q, + e, gfp_mask | __GFP_NOFAIL + | __GFP_ZERO, new_ioq); + spin_lock_irq(q->queue_lock); + goto retry; + } else { + sched_q = e->ops->elevator_alloc_sched_queue_fn(q, e, + gfp_mask | __GFP_ZERO, ioq); + if (!sched_q) { + elv_free_ioq(ioq); + goto queue_fail; + } + } + + elv_init_ioq(e, ioq, current->pid, 1); + elv_init_ioq_io_group(e, ioq, iog); + elv_init_ioq_prio_data(e, ioq, IOPRIO_CLASS_BE, IOPRIO_NORM); + elv_init_ioq_sched_queue(e, ioq, sched_q); + + io_group_set_ioq(iog, ioq); + elv_mark_ioq_sync(ioq); + elv_get_iog(iog); + } + + if (new_sched_q) + e->ops->elevator_free_sched_queue_fn(q->elevator, new_sched_q); + + if (new_ioq) + elv_free_ioq(new_ioq); + + /* Request reference */ + elv_get_ioq(ioq); + rq->ioq = ioq; + spin_unlock_irqrestore(q->queue_lock, flags); + return 0; + +queue_fail: + WARN_ON((gfp_mask & __GFP_WAIT) && !ioq); + elv_schedule_dispatch(q); + spin_unlock_irqrestore(q->queue_lock, flags); + return 1; +} + +/* + * Find out the io queue of bio belongs to. Optimization for single ioq + * per io group io schedulers. + */ +struct io_queue *elv_lookup_ioq_bio(struct request_queue *q, struct bio *bio) +{ + struct io_group *iog; + + /* Determine the io group and io queue of the bio submitting task */ + iog = io_get_io_group_bio(q, bio, 0); + if (!iog) { + /* May be bio belongs to a cgroup for which io group has + * not been setup yet. */ + return NULL; + } + return iog->ioq; +} + +/* + * This request has been serviced. Clean up ioq info and drop the reference. + * Again this is called only for single queue per cgroup schedulers (noop, + * deadline, AS). 
+ */ +void elv_fq_unset_request_ioq(struct request_queue *q, struct request *rq) +{ + struct io_queue *ioq = rq->ioq; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return; + + if (ioq) { + rq->ioq = NULL; + elv_put_ioq(ioq); + } +} + +static inline int is_only_root_group(void) +{ + if (list_empty(&io_root_cgroup.css.cgroup->children)) + return 1; + + return 0; +} + +#else /* GROUP_IOSCHED */ +static inline void +bfq_init_entity(struct io_entity *entity, struct io_group *iog) +{ + entity->sched_data = &iog->sched_data; +} + +static inline void io_disconnect_groups(struct elevator_queue *e) {} +static inline unsigned int iog_weight(struct io_group *iog) { return 0; } + +static struct io_group *io_alloc_root_group(struct request_queue *q, + struct elevator_queue *e, void *key) +{ + struct io_group *iog; + int i; + + iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node); + if (iog == NULL) + return NULL; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) + iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT; + + return iog; +} + +static void io_free_root_group(struct elevator_queue *e) +{ + struct io_group *iog = e->efqd.root_group; + struct io_service_tree *st; + int i; + + for (i = 0; i < IO_IOPRIO_CLASSES; i++) { + st = iog->sched_data.service_tree + i; + io_flush_idle_tree(st); + } + + io_put_io_group_queues(e, iog); + kfree(iog); +} + +struct io_group *io_get_io_group(struct request_queue *q, struct page *page, + int create) +{ + /* In flat mode, there is only root group */ + return q->elevator->efqd.root_group; +} +EXPORT_SYMBOL(io_get_io_group); + +struct io_group *io_get_io_group_bio(struct request_queue *q, struct bio *bio, + int create) +{ + return q->elevator->efqd.root_group; +} +EXPORT_SYMBOL(io_get_io_group_bio); + +static inline int is_only_root_group(void) +{ + return 1; +} +#endif /* GROUP_IOSCHED */ + +/* Elevator fair queuing function */ +static inline struct io_queue *elv_active_ioq(struct elevator_queue *e) +{ + return 
e->efqd.active_queue; +} + +void *elv_active_sched_queue(struct elevator_queue *e) +{ + return ioq_sched_queue(elv_active_ioq(e)); +} +EXPORT_SYMBOL(elv_active_sched_queue); + +int elv_rq_in_driver(struct elevator_queue *e) +{ + return e->efqd.rq_in_driver; +} +EXPORT_SYMBOL(elv_rq_in_driver); + +int elv_nr_busy_ioq(struct elevator_queue *e) +{ + return e->efqd.busy_queues; +} +EXPORT_SYMBOL(elv_nr_busy_ioq); + +/* Helper functions for operating on elevator idle slice timer */ +int elv_mod_idle_slice_timer(struct elevator_queue *eq, unsigned long expires) +{ + struct elv_fq_data *efqd = &eq->efqd; + + return mod_timer(&efqd->idle_slice_timer, expires); +} +EXPORT_SYMBOL(elv_mod_idle_slice_timer); + +int elv_del_idle_slice_timer(struct elevator_queue *eq) +{ + struct elv_fq_data *efqd = &eq->efqd; + + return del_timer(&efqd->idle_slice_timer); +} +EXPORT_SYMBOL(elv_del_idle_slice_timer); + +static void elv_ioq_served(struct io_queue *ioq, unsigned long served) +{ + entity_served(&ioq->entity, served, ioq->nr_sectors); + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + { + struct elv_fq_data *efqd = ioq->efqd; + struct io_group *iog = ioq_to_io_group(ioq); + elv_log_ioq(efqd, ioq, "ioq served: QSt=0x%lx QSs=0x%lx" + " QTt=0x%lx QTs=0x%lx GTt=0x%lx " + " GTs=0x%lx rq_queued=%d", + served, ioq->nr_sectors, + ioq->entity.total_service, + ioq->entity.total_sector_service, + iog->entity.total_service, + iog->entity.total_sector_service, + ioq->nr_queued); + } +#endif +} + +/* + * sysfs parts below --> + */ +static ssize_t +elv_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t +elv_var_store(unsigned int *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtoul(p, &p, 10); + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct elv_fq_data *efqd = &e->efqd; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = 
jiffies_to_msecs(__data); \ + return elv_var_show(__data, (page)); \ +} +SHOW_FUNCTION(elv_group_idle_show, efqd->elv_group_idle, 1); +EXPORT_SYMBOL(elv_group_idle_show); +SHOW_FUNCTION(elv_slice_sync_show, efqd->elv_slice[1], 1); +EXPORT_SYMBOL(elv_slice_sync_show); +SHOW_FUNCTION(elv_slice_async_show, efqd->elv_slice[0], 1); +EXPORT_SYMBOL(elv_slice_async_show); +SHOW_FUNCTION(elv_fairness_show, efqd->fairness, 0); +EXPORT_SYMBOL(elv_fairness_show); +SHOW_FUNCTION(elv_map_sync_show, efqd->map_sync, 0); +EXPORT_SYMBOL(elv_map_sync_show); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct elv_fq_data *efqd = &e->efqd; \ + unsigned int __data; \ + int ret = elv_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(elv_group_idle_store, &efqd->elv_group_idle, 0, UINT_MAX, 1); +EXPORT_SYMBOL(elv_group_idle_store); +STORE_FUNCTION(elv_slice_sync_store, &efqd->elv_slice[1], 1, UINT_MAX, 1); +EXPORT_SYMBOL(elv_slice_sync_store); +STORE_FUNCTION(elv_slice_async_store, &efqd->elv_slice[0], 1, UINT_MAX, 1); +EXPORT_SYMBOL(elv_slice_async_store); +STORE_FUNCTION(elv_fairness_store, &efqd->fairness, 0, 1, 0); +EXPORT_SYMBOL(elv_fairness_store); +STORE_FUNCTION(elv_map_sync_store, &efqd->map_sync, 0, 1, 0); +EXPORT_SYMBOL(elv_map_sync_store); +#undef STORE_FUNCTION + +void elv_schedule_dispatch(struct request_queue *q) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + + if (elv_nr_busy_ioq(q->elevator)) { + elv_log(efqd, "schedule dispatch"); + kblockd_schedule_work(efqd->queue, &efqd->unplug_work); + } +} +EXPORT_SYMBOL(elv_schedule_dispatch); + +static void elv_kick_queue(struct work_struct *work) +{ + struct elv_fq_data *efqd = + container_of(work, struct 
elv_fq_data, unplug_work); + struct request_queue *q = efqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +static void elv_shutdown_timer_wq(struct elevator_queue *e) +{ + del_timer_sync(&e->efqd.idle_slice_timer); + cancel_work_sync(&e->efqd.unplug_work); +} + +static void elv_ioq_set_prio_slice(struct request_queue *q, + struct io_queue *ioq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + + ioq->slice_end = jiffies + ioq->entity.budget; + elv_log_ioq(efqd, ioq, "set_slice=%lu", ioq->entity.budget); +} + +struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask) +{ + struct io_queue *ioq = NULL; + + ioq = kmem_cache_alloc_node(elv_ioq_pool, gfp_mask, q->node); + return ioq; +} +EXPORT_SYMBOL(elv_alloc_ioq); + +void elv_free_ioq(struct io_queue *ioq) +{ + kmem_cache_free(elv_ioq_pool, ioq); +} +EXPORT_SYMBOL(elv_free_ioq); + +int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq, pid_t pid, + int is_sync) +{ + struct elv_fq_data *efqd = &eq->efqd; + + RB_CLEAR_NODE(&ioq->entity.rb_node); + atomic_set(&ioq->ref, 0); + ioq->efqd = efqd; + if (elv_iosched_single_ioq(eq)) + ioq->pid = 0; + else + ioq->pid = current->pid; + + return 0; +} +EXPORT_SYMBOL(elv_init_ioq); + +void elv_init_ioq_io_group(struct elevator_queue *eq, struct io_queue *ioq, + void *iog) +{ + bfq_init_entity(&ioq->entity, iog); +} +EXPORT_SYMBOL(elv_init_ioq_io_group); + +void elv_init_ioq_sched_queue(struct elevator_queue *eq, struct io_queue *ioq, + void *sched_queue) +{ + ioq->sched_queue = sched_queue; +} +EXPORT_SYMBOL(elv_init_ioq_sched_queue); + +void elv_init_ioq_prio_data(struct elevator_queue *eq, struct io_queue *ioq, + int ioprio_class, int ioprio) +{ + struct elv_fq_data *efqd = &eq->efqd; + + elv_ioq_set_ioprio_class(ioq, ioprio_class); + elv_ioq_set_ioprio(ioq, ioprio); + /* + * This is the first time ioq is being initialized. Above functions + * will set new_ioprio and new_ioprio_class. 
Also initialize ioprio + * and ioprio_class. + */ + ioq->entity.ioprio = ioq->entity.new_ioprio; + ioq->entity.ioprio_class = ioq->entity.new_ioprio_class; + ioq->entity.budget = elv_prio_to_slice(efqd, ioq); + ioq->entity.weight = ioq->entity.new_weight; + BUG_ON(!ioq->entity.weight); +} +EXPORT_SYMBOL(elv_init_ioq_prio_data); + +struct io_queue *elv_get_oom_ioq(struct elevator_queue *eq) +{ + return &eq->efqd.oom_ioq; +} +EXPORT_SYMBOL(elv_get_oom_ioq); + +void elv_put_ioq(struct io_queue *ioq) +{ + struct elv_fq_data *efqd = ioq->efqd; + struct elevator_queue *e = container_of(efqd, struct elevator_queue, + efqd); + struct io_group *iog; + + BUG_ON(atomic_read(&ioq->ref) <= 0); + if (!atomic_dec_and_test(&ioq->ref)) + return; + + iog = ioq_to_io_group(ioq); + + BUG_ON(ioq->nr_queued); + BUG_ON(ioq->entity.tree != NULL); + BUG_ON(elv_ioq_busy(ioq)); + BUG_ON(efqd->active_queue == ioq); + + /* Can be called by outgoing elevator. Don't use q */ + BUG_ON(!e->ops->elevator_free_sched_queue_fn); + + e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue); + elv_log_ioq(efqd, ioq, "put_queue"); + elv_free_ioq(ioq); + elv_put_iog(iog); +} +EXPORT_SYMBOL(elv_put_ioq); + +static void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr) +{ + struct io_queue *ioq = *ioq_ptr; + + if (ioq != NULL) { + /* Drop the reference taken by the io group */ + elv_put_ioq(ioq); + *ioq_ptr = NULL; + } +} + +static void elv_activate_ioq(struct io_queue *ioq, int add_front) +{ + bfq_activate_entity(&ioq->entity, add_front); +} + +static void elv_deactivate_ioq(struct elv_fq_data *efqd, struct io_queue *ioq, + int requeue) +{ + requeue = update_requeue(ioq, requeue); + bfq_deactivate_entity(&ioq->entity, requeue); +} + +/* + * Normally next io queue to be served is selected from the service tree. + * This function allows one to choose a specific io queue to run next + * out of order. This is primarily to accomodate the close_cooperator + * feature of cfq. 
+ * + */ +static void elv_set_next_ioq(struct request_queue *q, struct io_queue *ioq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + + BUG_ON(efqd->active_queue != NULL); + + /* + * This ioq is already on active tree. Just reactivate it back with + * add_front = 1. This will make sure that this ioq is put at the + * front of this group's service tree and will be selected to run + * next. + */ + elv_activate_ioq(ioq, 1); + elv_log_ioq(efqd, ioq, "set_next_ioq"); +} + +/* + * Get next queue for service. + * + * Starts at the root group's sched_data and follows entity->my_sched_data + * downwards, asking bfq_lookup_next_entity() for an entity at each level, + * until an entity with no my_sched_data is reached; that entity is converted + * with io_entity_to_ioq() and returned. + * + * Returns NULL when there are no busy queues or when a lookup yields no + * entity. With extract set, a NULL lookup trips BUG_ON() instead, and the + * service field of every entity visited is reset to 0. + * + * NOTE(review): bfq_lookup_next_entity() is always passed 1 as its second + * argument, independent of extract -- confirm this is intended. + */ +static struct io_queue *elv_get_next_ioq(struct request_queue *q, int extract) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + struct io_entity *entity = NULL; + struct io_queue *ioq = NULL; + struct io_sched_data *sd; + + /* + * We should not call lookup when an entity is active, as doing + * lookup can result in an erroneous vtime jump. + */ + BUG_ON(efqd->active_queue != NULL); + + if (!efqd->busy_queues) + return NULL; + + sd = &efqd->root_group->sched_data; + + for (; sd != NULL; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1); + /* + * entity can be null despite the fact that there are busy + * queues. if all the busy queues are under a group which is + * currently under service. + * So if we are just looking for next ioq while something is + * being served, null entity is not an error. + */ + BUG_ON(!entity && extract); + + /* + * The BUG_ON above has already trapped a NULL entity when + * extract is set, so this dereference is not reached with + * a NULL pointer. + */ + if (extract) + entity->service = 0; + + if (!entity) + return NULL; + } + + ioq = io_entity_to_ioq(entity); + + return ioq; +} + +/* + * coop (cooperating queue) tells that io scheduler selected a queue for us + * and we did not select the next queue based on fairness. 
+ */ +static void __elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq, + int coop) +{ + struct request_queue *q = efqd->queue; + + if (ioq) { + struct io_group *iog = ioq_to_io_group(ioq); + elv_log_ioq(efqd, ioq, "set_active, busy=%d ioprio=%d" + " weight=%u rq_queued=%d group_weight=%u", + efqd->busy_queues, + ioq->entity.ioprio, ioq->entity.weight, + ioq->nr_queued, iog_weight(iog)); +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + { + int nr_active = 0; + struct io_group *parent = NULL; + + parent = iog_parent(iog); + if (parent) + nr_active = elv_iog_nr_active(parent); + + elv_log_ioq(efqd, ioq, "set_active, ioq" + " nrgrps=%d QTt=0x%lx QTs=0x%lx GTt=0x%lx " + " GTs=0x%lx rq_queued=%d", nr_active, + ioq->entity.total_service, + ioq->entity.total_sector_service, + iog->entity.total_service, + iog->entity.total_sector_service, + ioq->nr_queued); + } +#endif + ioq->slice_end = 0; + ioq->slice_start = jiffies; + + elv_clear_ioq_wait_request(ioq); + elv_clear_iog_wait_request(iog); + elv_clear_ioq_must_dispatch(ioq); + elv_clear_iog_wait_busy_done(iog); + elv_mark_ioq_slice_new(ioq); + elv_clear_ioq_must_expire(ioq); + + del_timer(&efqd->idle_slice_timer); + } + + efqd->active_queue = ioq; + + /* Let iosched know if it wants to take some action */ + if (ioq) { + if (q->elevator->ops->elevator_active_ioq_set_fn) + q->elevator->ops->elevator_active_ioq_set_fn(q, + ioq->sched_queue, coop); + } +} + +/* Get and set a new active queue for service. */ +static struct io_queue *elv_set_active_ioq(struct request_queue *q, + struct io_queue *ioq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + int coop = 0; + + if (ioq) { + elv_set_next_ioq(q, ioq); + /* + * io scheduler selected the next queue for us. Pass this + * this info back to io scheudler. cfq currently uses it + * to reset coop flag on the queue. 
+ */ + coop = 1; + } + + ioq = elv_get_next_ioq(q, 1); + __elv_set_active_ioq(efqd, ioq, coop); + return ioq; +} + +static void elv_reset_active_ioq(struct elv_fq_data *efqd) +{ + struct request_queue *q = efqd->queue; + struct io_queue *ioq = elv_active_ioq(efqd->queue->elevator); + + if (q->elevator->ops->elevator_active_ioq_reset_fn) + q->elevator->ops->elevator_active_ioq_reset_fn(q, + ioq->sched_queue); + efqd->active_queue = NULL; + del_timer(&efqd->idle_slice_timer); +} + +/* Called when an inactive queue receives a new request. */ +static void elv_add_ioq_busy(struct elv_fq_data *efqd, struct io_queue *ioq) +{ + BUG_ON(elv_ioq_busy(ioq)); + BUG_ON(ioq == efqd->active_queue); + elv_activate_ioq(ioq, 0); + elv_mark_ioq_busy(ioq); + efqd->busy_queues++; +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + { + struct io_group *iog = ioq_to_io_group(ioq); + elv_log_ioq(efqd, ioq, "add to busy: QTt=0x%lx QTs=0x%lx" + " GTt=0x%lx GTs=0x%lx rq_queued=%d", + ioq->entity.total_service, + ioq->entity.total_sector_service, + iog->entity.total_service, + iog->entity.total_sector_service, + ioq->nr_queued); + } +#else + elv_log_ioq(efqd, ioq, "add to busy"); +#endif +} + +static void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq, + int requeue) +{ + struct elv_fq_data *efqd = &e->efqd; + + BUG_ON(!elv_ioq_busy(ioq)); + BUG_ON(ioq->nr_queued); +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + { + struct io_group *iog = ioq_to_io_group(ioq); + elv_log_ioq(efqd, ioq, "del from busy: QTt=0x%lx " + "QTs=0x%lx ioq GTt=0x%lx GTs=0x%lx " + "rq_queued=%d", + ioq->entity.total_service, + ioq->entity.total_sector_service, + iog->entity.total_service, + iog->entity.total_sector_service, + ioq->nr_queued); + } +#else + elv_log_ioq(efqd, ioq, "del from busy"); +#endif + elv_clear_ioq_busy(ioq); + BUG_ON(efqd->busy_queues == 0); + efqd->busy_queues--; + elv_deactivate_ioq(efqd, ioq, requeue); +} + +/* + * Call iosched to let that elevator wants to expire the queue. 
This gives + * iosched like AS to say no (if it is in the middle of batch changeover or + * it is anticipating). it also allows iosched to do some house keeping + * + * force--> it is force dispatch and iosched must clean up its state. This + * is useful when elevator wants to drain iosched and wants to + * expire currnent active queue. + * + * slice_expired--> if 1, ioq slice expired hence elevator fair queuing logic + * wants to switch the queue. iosched should allow that until + * and unless necessary. Currently AS can deny the switch if + * in the middle of batch switch. + * + * if 0, time slice is still remaining. It is up to the iosched + * whether it wants to wait on this queue or just want to + * expire it and move on to next queue. + * + */ +static int elv_iosched_expire_ioq(struct request_queue *q, int slice_expired, + int force) +{ + struct elevator_queue *e = q->elevator; + struct io_queue *ioq = elv_active_ioq(q->elevator); + /* + * ret stays 1 when the io scheduler provides no + * elevator_expire_ioq_fn hook; otherwise the hook's return value + * is passed back to the caller. + */ + int ret = 1; + + if (e->ops->elevator_expire_ioq_fn) { + /* + * NOTE(review): ioq comes from elv_active_ioq() and is + * dereferenced without a NULL check -- presumably callers + * only invoke this while a queue is active; confirm. + */ + ret = e->ops->elevator_expire_ioq_fn(q, ioq->sched_queue, + slice_expired, force); + /* + * AS denied expiration of queue right now. Mark that elevator + * layer has requested ioscheduler (as) to expire this queue. + * Now as will try to expire this queue as soon as it can. + * Now don't try to dispatch from this queue even if we get + * a new request and if time slice is left. Do expire it once. + */ + if (!ret) + elv_mark_ioq_must_expire(ioq); + } + + return ret; +} + +/* + * Do the accounting. Determine how much service (in terms of time slices) + * current queue used and adjust the start, finish time of queue and vtime + * of the tree accordingly. + * + * Determining the service used in terms of time is tricky in certain + * situations. Especially when underlying device supports command queuing + * and requests from multiple queues can be there at same time, then it + * is not clear which queue consumed how much of disk time. 
+ * + * To mitigate this problem, cfq starts the time slice of the queue only + * after first request from the queue has completed. This does not work + * very well if we expire the queue before we wait for first and more + * request to finish from the queue. For seeky queues, we will expire the + * queue after dispatching few requests without waiting and start dispatching + * from next queue. + * + * Currently one should set fairness = 1 to force completion of requests + * from queue before dispatch from next queue starts. This should help in + * better time accounting at the expense of throughput. + */ +void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + struct io_entity *entity = &ioq->entity; + long slice_unused = 0, slice_used = 0, slice_overshoot = 0; + struct io_group *iog = ioq_to_io_group(ioq); + + assert_spin_locked(q->queue_lock); + elv_log_ioq(efqd, ioq, "slice expired"); + + if (elv_ioq_wait_request(ioq) || elv_iog_wait_request(iog) + || elv_iog_wait_busy(iog)) + del_timer(&efqd->idle_slice_timer); + + elv_clear_ioq_wait_request(ioq); + elv_clear_iog_wait_request(iog); + elv_clear_iog_wait_busy(iog); + elv_clear_iog_wait_busy_done(iog); + elv_clear_ioq_must_expire(ioq); + + slice_used = jiffies - ioq->slice_start; + if (!slice_used) { + slice_used = 1; + goto done; + } + + /* + * Queue got expired before even a single request completed. Use + * the time elapsed since queue was scheduled in. + */ + if (!ioq->slice_end) + goto done; + + if (time_after(ioq->slice_end, jiffies)) { + slice_unused = ioq->slice_end - jiffies; + if (slice_unused == entity->budget) { + /* + * queue got expired immediately after + * completing first request. Charge 1/2 of + * time consumed in completing first request. 
+ */ + slice_used = (slice_used + 1)/2; + } else + slice_used = entity->budget - slice_unused; + } else { + slice_overshoot = jiffies - ioq->slice_end; + slice_used = entity->budget + slice_overshoot; + } + +done: + elv_log_ioq(efqd, ioq, "sl_start= %lx sl_end=%lx, jiffies=%lx", + ioq->slice_start, ioq->slice_end, jiffies); + elv_log_ioq(efqd, ioq, "sl_used=%ld, budget=%ld overshoot=%ld sect=%lu", + slice_used, entity->budget, slice_overshoot, + ioq->nr_sectors); + elv_ioq_served(ioq, slice_used); + + BUG_ON(ioq != efqd->active_queue); + elv_reset_active_ioq(efqd); + + /* Queue is being expired. Reset number of secotrs dispatched */ + ioq->nr_sectors = 0; + if (!ioq->nr_queued) + elv_del_ioq_busy(q->elevator, ioq, 1); + else + elv_activate_ioq(ioq, 0); +} +EXPORT_SYMBOL(__elv_ioq_slice_expired); + +/* + * Expire the ioq. + */ +void elv_ioq_slice_expired(struct request_queue *q) +{ + struct io_queue *ioq = elv_active_ioq(q->elevator); + + if (ioq) + __elv_ioq_slice_expired(q, ioq); +} + +/* + * Check if new_cfqq should preempt the currently active queue. Return 0 for + * no or if we aren't sure, a 1 will cause a preemption attempt. + */ +static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq, + struct request *rq) +{ + struct io_queue *ioq; + struct elevator_queue *eq = q->elevator; + struct io_entity *entity, *new_entity; + struct io_group *iog = NULL, *new_iog = NULL; + + /* + * Currently only CFQ has preemption logic. Other schedulers don't + * have any notion of preemption across classes or preemption with-in + * class etc. + */ + if (elv_iosched_single_ioq(eq)) + return 0; + + ioq = elv_active_ioq(eq); + + if (!ioq) + return 0; + + entity = &ioq->entity; + new_entity = &new_ioq->entity; + + /* + * In hierarchical setup, one need to traverse up the hierarchy + * till both the queues are children of same parent to make a + * decision whether to do the preemption or not. 
+ */ + bfq_find_matching_entity(&entity, &new_entity); + + /* + * Allow an RT request to pre-empt an ongoing non-RT queue timeslice. + */ + + if (new_entity->ioprio_class == IOPRIO_CLASS_RT + && entity->ioprio_class != IOPRIO_CLASS_RT) + return 1; + /* + * Allow a BE request to pre-empt an ongoing IDLE class timeslice. + */ + + if (new_entity->ioprio_class == IOPRIO_CLASS_BE + && entity->ioprio_class == IOPRIO_CLASS_IDLE) + return 1; + + /* + * If both the queues belong to same group, check with io scheduler + * if it has additional criterion based on which it wants to + * preempt existing queue. + */ + iog = ioq_to_io_group(ioq); + new_iog = ioq_to_io_group(new_ioq); + + if (iog != new_iog) + return 0; + + + if (eq->ops->elevator_should_preempt_fn) + return eq->ops->elevator_should_preempt_fn(q, + ioq_sched_queue(new_ioq), rq); + + return 0; +} + +static void elv_preempt_queue(struct request_queue *q, struct io_queue *ioq) +{ + elv_log_ioq(&q->elevator->efqd, ioq, "preempt"); + if (elv_iosched_expire_ioq(q, 0, 1)) { + elv_ioq_slice_expired(q); + + /* + * Put the new queue at the front of the current list, + * so we know that it will be selected next. + */ + + elv_activate_ioq(ioq, 1); + ioq->slice_end = 0; + elv_mark_ioq_slice_new(ioq); + } +} + +void elv_ioq_request_add(struct request_queue *q, struct request *rq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + struct io_queue *ioq = rq->ioq; + struct io_group *iog = ioq_to_io_group(ioq); + int group_wait = 0; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return; + + BUG_ON(!efqd); + BUG_ON(!ioq); + ioq->nr_queued++; + elv_log_ioq(efqd, ioq, "add rq: rq_queued=%lu", ioq->nr_queued); + + if (!elv_ioq_busy(ioq)) + elv_add_ioq_busy(efqd, ioq); + + if (elv_iog_wait_request(iog)) { + del_timer(&efqd->idle_slice_timer); + elv_clear_iog_wait_request(iog); + group_wait = 1; + } + + /* + * If we were waiting for a request on this group, wait is + * done.
Schedule the next dispatch + */ + if (elv_iog_wait_busy(iog)) { + del_timer(&efqd->idle_slice_timer); + elv_clear_iog_wait_busy(iog); + elv_mark_iog_wait_busy_done(iog); + elv_schedule_dispatch(q); + return; + } + + if (ioq == elv_active_ioq(q->elevator)) { + /* + * Remember that we saw a request from this process, but + * don't start queuing just yet. Otherwise we risk seeing lots + * of tiny requests, because we disrupt the normal plugging + * and merging. If the request is already larger than a single + * page, let it rip immediately. For that case we assume that + * merging is already done. Ditto for a busy system that + * has other work pending, don't risk delaying until the + * idle timer unplug to continue working. + */ + if (group_wait || elv_ioq_wait_request(ioq)) { + del_timer(&efqd->idle_slice_timer); + if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || + efqd->busy_queues > 1 || !blk_queue_plugged(q)) + __blk_run_queue(q); + else + elv_mark_ioq_must_dispatch(ioq); + } + } else if (elv_should_preempt(q, ioq, rq)) { + /* + * not the active queue - expire current slice if it is + * idle and has expired it's mean thinktime or this new queue + * has some old slice time left and is of higher priority or + * this new queue is RT and the current one is BE + */ + elv_preempt_queue(q, ioq); + __blk_run_queue(q); + } else if (group_wait) { + /* + * Got a request in the group we were waiting for. Request + * does not belong to active queue and we have not decided + * to preempt the current active queue. Schedule the dispatch. 
+ */ + elv_schedule_dispatch(q); + } +} + +static void elv_idle_slice_timer(unsigned long data) +{ + struct elv_fq_data *efqd = (struct elv_fq_data *)data; + struct io_queue *ioq; + unsigned long flags; + struct request_queue *q = efqd->queue; + + elv_log(efqd, "idle timer fired"); + + spin_lock_irqsave(q->queue_lock, flags); + + ioq = efqd->active_queue; + + if (ioq) { + struct io_group *iog = ioq_to_io_group(ioq); + + elv_clear_iog_wait_request(iog); + + if (elv_iog_wait_busy(iog)) { + elv_clear_iog_wait_busy(iog); + goto expire; + } + + /* + * We saw a request before the queue expired, let it through + */ + if (elv_ioq_must_dispatch(ioq)) + goto out_kick; + + /* + * the slice of the active queue has been used up, expire it + */ + if (elv_ioq_slice_used(ioq)) + goto expire; + + /* + * only expire and reinvoke request handler, if there are + * other queues with pending requests + */ + if (!elv_nr_busy_ioq(q->elevator)) + goto out_cont; + + /* + * not expired and it has a request pending, let it dispatch + */ + if (ioq->nr_queued) + goto out_kick; + } +expire: + elv_ioq_slice_expired(q); +out_kick: + elv_schedule_dispatch(q); +out_cont: + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void elv_ioq_arm_slice_timer(struct request_queue *q) +{ + struct elevator_queue *eq = q->elevator; + struct io_queue *ioq = elv_active_ioq(eq); + + BUG_ON(!ioq); + + /* + * The iosched may have its own idling logic. In that case the io + * scheduler will take care of arming the timer, if need be. + */ + if (eq->ops->elevator_arm_slice_timer_fn) + eq->ops->elevator_arm_slice_timer_fn(q, ioq->sched_queue); +} + +static void elv_iog_arm_slice_timer(struct request_queue *q, + struct io_group *iog, int wait_for_busy) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + unsigned long sl; + + if (!efqd->elv_group_idle || !elv_iog_idle_window(iog)) + return; + /* + * This queue has consumed its time slice. We are waiting only for + * it to become busy before we select next queue for dispatch.
+ */ + if (wait_for_busy) { + elv_mark_iog_wait_busy(iog); + sl = efqd->elv_group_idle; + mod_timer(&efqd->idle_slice_timer, jiffies + sl); + elv_log_iog(efqd, iog, "arm idle group: %lu wait busy=1", sl); + return; + } + + elv_mark_iog_wait_request(iog); + sl = efqd->elv_group_idle; + mod_timer(&efqd->idle_slice_timer, jiffies + sl); + elv_log_iog(efqd, iog, "arm_idle group: %lu", sl); +} + +/* + * If io scheduler has functionality of keeping track of close cooperator, check + * with it if it has got a closely co-operating queue. + */ +static inline struct io_queue *elv_close_cooperator(struct request_queue *q, + struct io_queue *ioq) +{ + struct elevator_queue *e = q->elevator; + struct io_queue *new_ioq = NULL; + void *sched_queue = ioq->sched_queue; + + if (q->elevator->ops->elevator_close_cooperator_fn) + new_ioq = e->ops->elevator_close_cooperator_fn(q, sched_queue); + + /* Only select co-operating queue if it belongs to same group as ioq */ + if (new_ioq && !is_same_group(&ioq->entity, &new_ioq->entity)) + return NULL; + + if (new_ioq) + elv_log_ioq(&e->efqd, ioq, "cooperating ioq=%d", new_ioq->pid); + + return new_ioq; +} + +/* Common layer function to select the next queue to dispatch from */ +void *elv_fq_select_ioq(struct request_queue *q, int force) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + struct io_queue *new_ioq = NULL, *ioq = elv_active_ioq(q->elevator); + struct io_group *iog; + struct elevator_type *e = q->elevator->elevator_type; + int slice_expired = 1; + + if (!elv_nr_busy_ioq(q->elevator)) + return NULL; + + if (ioq == NULL) + goto new_queue; + + iog = ioq_to_io_group(ioq); + + /* + * Force dispatch. Continue to dispatch from current queue as long + * as it has requests. + */ + if (unlikely(force)) { + if (ioq->nr_queued) + goto keep_queue; + else + goto expire; + } + + /* This queue has been marked for expiry. 
Try to expire it */ + if (elv_ioq_must_expire(ioq)) + goto expire; + + /* + * If there is only root group present, don't expire the queue for + * single queue ioschedulers (noop, deadline, AS). It is unnecessary + * overhead. + */ + + if (is_only_root_group() && elv_iosched_single_ioq(q->elevator)) { + elv_log_ioq(efqd, ioq, "select: only root group, no expiry"); + goto keep_queue; + } + + + /* We are waiting for this group to become busy before it expires.*/ + if (elv_iog_wait_busy(iog)) { + ioq = NULL; + goto keep_queue; + } + + /* + * The active queue has run out of time, expire it and select new. + */ + if (elv_ioq_slice_used(ioq) && !elv_ioq_must_dispatch(ioq)) { + /* + * Queue has used up its slice. Wait busy is not on otherwise + * we wouldn't have been here. If this group will be deleted + * after the queue expiry, then make sure we have onece + * done wait busy on the group in an attempt to make it + * backlogged. + * + * Following check helps in two conditions. + * - If there are requests dispatched from the queue and + * select_ioq() comes before a request completed from the + * queue and got a chance to arm any of the idle timers. + * + * - If at request completion time slice had not expired and + * we armed either a ioq timer or group timer but when + * select_ioq() hits, slice has expired and it will expire + * the queue without doing busy wait on group. + * + * In similar situations cfq lets delte the queue even if + * idle timer is armed. That does not impact fairness in non + * hierarhical setup due to weighted slice lengths. But in + * hierarchical setup where group slice lengths are derived + * from queue and is not proportional to group's weight, it + * harms the fairness of the group. 
+ */ + if ((elv_iog_nr_active(iog) <= 1) && !ioq->nr_queued + && !elv_iog_wait_busy_done(iog) && efqd->elv_group_idle + && elv_iog_idle_window(iog)) { + ioq = NULL; + goto keep_queue; + } else + goto expire; + } + + /* + * The active queue has requests and isn't expired, allow it to + * dispatch. + */ + + if (ioq->nr_queued) + goto keep_queue; + + /* + * If another queue has a request waiting within our mean seek + * distance, let it run. The expire code will check for close + * cooperators and put the close queue at the front of the service + * tree. + */ + new_ioq = elv_close_cooperator(q, ioq); + if (new_ioq) + goto expire; + + /* + * No requests pending. If the active queue still has requests in + * flight or is idling for a new request, allow either of these + * conditions to happen (or time out) before selecting a new queue. + */ + + if (timer_pending(&efqd->idle_slice_timer) || + (elv_ioq_nr_dispatched(ioq) && elv_ioq_idle_window(ioq))) { + ioq = NULL; + goto keep_queue; + } + + /* Check for group idling */ + if (elv_iog_idle_window(iog) && (elv_iog_nr_active(iog) <= 1) + && elv_ioq_nr_dispatched(ioq)) { + ioq = NULL; + goto keep_queue; + } + + slice_expired = 0; +expire: + if (efqd->fairness && !force && ioq && ioq->dispatched + && strcmp(e->elevator_name, "anticipatory")) { + /* + * If there are request dispatched from this queue, don't + * dispatch requests from new queue till all the requests from + * this queue have completed. + * + * This helps in attributing right amount of disk time consumed + * by a particular queue when hardware allows queuing. + * + * Set ioq = NULL so that no more requests are dispatched from + * this queue. + * + * Note: Anticipatory already has the behavior where queue + * switch is not allowed until requests from previous queue + * have finished. Hence we don't have to get into this loop + * in case of AS. 
+ */ + elv_log_ioq(efqd, ioq, "select: wait for requests to finish" + " disp=%d", ioq->dispatched); + ioq = NULL; + goto keep_queue; + } + + if (elv_iosched_expire_ioq(q, slice_expired, force)) + elv_ioq_slice_expired(q); + else + /* + * Not making ioq = NULL, as AS can deny queue expiration and + * continue to dispatch from same queue + */ + goto keep_queue; +new_queue: + ioq = elv_set_active_ioq(q, new_ioq); +keep_queue: + if (ioq) + elv_log_ioq(efqd, ioq, "select busy=%d qued=%lu disp=%d", + elv_nr_busy_ioq(q->elevator), ioq->nr_queued, + elv_ioq_nr_dispatched(ioq)); + return ioq; +} + +/* A request got removed from io_queue. Do the accounting */ +void elv_ioq_request_removed(struct elevator_queue *e, struct request *rq) +{ + struct io_queue *ioq; + struct elv_fq_data *efqd; + + if (!elv_iosched_fair_queuing_enabled(e)) + return; + + ioq = rq->ioq; + BUG_ON(!ioq); + ioq->nr_queued--; + + efqd = ioq->efqd; + BUG_ON(!efqd); + + if (elv_ioq_busy(ioq) && (elv_active_ioq(e) != ioq) && !ioq->nr_queued) + elv_del_ioq_busy(e, ioq, 1); +} + +/* A request got dispatched. Do the accounting.
*/ +void elv_fq_dispatched_request(struct elevator_queue *e, struct request *rq) +{ + struct io_queue *ioq = rq->ioq; + + if (!elv_iosched_fair_queuing_enabled(e)) + return; + + BUG_ON(!ioq); + ioq->dispatched++; + ioq->nr_sectors += blk_rq_sectors(rq); + elv_ioq_request_removed(e, rq); + elv_clear_ioq_must_dispatch(ioq); +} + +void elv_fq_activate_rq(struct request_queue *q, struct request *rq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return; + + efqd->rq_in_driver++; + elv_log_ioq(efqd, rq->ioq, "activate rq, drv=%d", + efqd->rq_in_driver); +} + +void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq) +{ + struct elv_fq_data *efqd = &q->elevator->efqd; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return; + + WARN_ON(!efqd->rq_in_driver); + efqd->rq_in_driver--; + elv_log_ioq(efqd, rq->ioq, "deactivate rq, drv=%d", + efqd->rq_in_driver); +} + +/* A request got completed from io_queue. Do the accounting. */ +void elv_ioq_completed_request(struct request_queue *q, struct request *rq) +{ + const int sync = rq_is_sync(rq); + struct io_queue *ioq; + struct elv_fq_data *efqd = &q->elevator->efqd; + struct io_group *iog; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return; + + ioq = rq->ioq; + iog = ioq_to_io_group(ioq); + WARN_ON(!efqd->rq_in_driver); + WARN_ON(!ioq->dispatched); + efqd->rq_in_driver--; + ioq->dispatched--; + + elv_log_ioq(efqd, ioq, "complete rq_queued=%lu drv=%d disp=%d", + ioq->nr_queued, efqd->rq_in_driver, + elv_ioq_nr_dispatched(ioq)); + /* + * If this is the active queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests.
+ */ + + if (elv_active_ioq(q->elevator) == ioq) { + if (elv_ioq_slice_new(ioq)) { + elv_ioq_set_prio_slice(q, ioq); + elv_clear_ioq_slice_new(ioq); + } + + if (elv_ioq_class_idle(ioq)) { + if (elv_iosched_expire_ioq(q, 1, 0)) + elv_ioq_slice_expired(q); + goto done; + } + + /* + * If there is only root group present, don't expire the queue + * for single queue ioschedulers (noop, deadline, AS). It is + * unnecessary overhead. + */ + + if (is_only_root_group() && + elv_iosched_single_ioq(q->elevator)) { + elv_log_ioq(efqd, ioq, "select: only root group," + " no expiry"); + goto done; + } + + /* + * If there are no requests waiting in this queue, and + * there are other queues ready to issue requests, AND + * those other queues are issuing requests within our + * mean seek distance, give them a chance to run instead + * of idling. + */ + if (elv_ioq_slice_used(ioq)) { + /* This is the last empty queue in the group and it + * has consumed its slice. If we expire it right away + * group might loose its share. Wait for an extra + * group_idle period for a request before queue + * expires. + */ + if ((elv_iog_nr_active(iog) <= 1) && !ioq->nr_queued) { + elv_iog_arm_slice_timer(q, iog, 1); + goto done; + } + + /* If fairness is set and there are requests + * dispatched from this queue, don't dispatch + * new requests from a different queue till + * all requests from this queue have finished. + * This helps in attributing right disk time + * to a queue when hardware supports queuing. + */ + + if (efqd->fairness && ioq->dispatched) + goto done; + + /* Expire the queue */ + if (elv_iosched_expire_ioq(q, 1, 0)) + elv_ioq_slice_expired(q); + } else if (!ioq->nr_queued && !elv_close_cooperator(q, ioq) + && sync && !rq_noidle(rq)) + elv_ioq_arm_slice_timer(q); + + /* + * If this is the last queue in the group and we did not + * decide to idle on queue, idle on group. 
+ */ + if (elv_active_ioq(q->elevator) && !ioq->nr_queued && + !ioq->dispatched && !timer_pending(&efqd->idle_slice_timer) + && (elv_iog_nr_active(iog) <= 1)) { + /* + * If queue has used up its slice, wait for the + * one extra group_idle period to let the group + * backlogged again. This is to avoid a group loosing + * its fair share. + */ + if (elv_ioq_slice_used(ioq)) + elv_iog_arm_slice_timer(q, iog, 1); + else + elv_iog_arm_slice_timer(q, iog, 0); + } + } +done: + if (!efqd->rq_in_driver) + elv_schedule_dispatch(q); +} + +void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class, + int ioprio) +{ + struct io_queue *ioq = NULL; + + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + ioq = iog->async_queue[0][ioprio]; + break; + case IOPRIO_CLASS_BE: + ioq = iog->async_queue[1][ioprio]; + break; + case IOPRIO_CLASS_IDLE: + ioq = iog->async_idle_queue; + break; + default: + BUG(); + } + + if (ioq) + return ioq->sched_queue; + return NULL; +} +EXPORT_SYMBOL(io_group_async_queue_prio); + +void io_group_set_async_queue(struct io_group *iog, int ioprio_class, + int ioprio, struct io_queue *ioq) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + iog->async_queue[0][ioprio] = ioq; + break; + case IOPRIO_CLASS_BE: + iog->async_queue[1][ioprio] = ioq; + break; + case IOPRIO_CLASS_IDLE: + iog->async_idle_queue = ioq; + break; + default: + BUG(); + } + + /* + * Take the group reference and pin the queue. Group exit will + * clean it up + */ + elv_get_ioq(ioq); +} +EXPORT_SYMBOL(io_group_set_async_queue); + +static void elv_slab_kill(void) +{ + /* + * Caller already ensured that pending RCU callbacks are completed, + * so we should have no busy allocations at this point. 
+ */ + if (elv_ioq_pool) + kmem_cache_destroy(elv_ioq_pool); +} + +static int __init elv_slab_setup(void) +{ + elv_ioq_pool = KMEM_CACHE(io_queue, 0); + if (!elv_ioq_pool) + goto fail; + + return 0; +fail: + elv_slab_kill(); + return -ENOMEM; +} + +/* Initialize fair queueing data associated with elevator */ +int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e) +{ + struct io_group *iog; + struct elv_fq_data *efqd = &e->efqd; + + if (!elv_iosched_fair_queuing_enabled(e)) + return 0; + + iog = io_alloc_root_group(q, e, efqd); + if (iog == NULL) + return 1; + + efqd->root_group = iog; + + /* + * Our fallback ioq if elv_alloc_ioq() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + elv_init_ioq(e, &efqd->oom_ioq, 1, 0); + elv_get_ioq(&efqd->oom_ioq); + elv_init_ioq_io_group(e, &efqd->oom_ioq, iog); + elv_init_ioq_prio_data(e, &efqd->oom_ioq, IOPRIO_CLASS_BE, IOPRIO_NORM); + + efqd->queue = q; + + init_timer(&efqd->idle_slice_timer); + efqd->idle_slice_timer.function = elv_idle_slice_timer; + efqd->idle_slice_timer.data = (unsigned long) efqd; + + INIT_WORK(&efqd->unplug_work, elv_kick_queue); + INIT_HLIST_HEAD(&efqd->group_list); + + efqd->elv_slice[0] = elv_slice_async; + efqd->elv_slice[1] = elv_slice_sync; + efqd->elv_group_idle = elv_group_idle; + + return 0; +} + +/* + * elv_exit_fq_data is called before we call elevator_exit_fn. Before + * we ask elevator to cleanup its queues, we do the cleanup here so + * that all the group and idle tree references to ioq are dropped. Later + * during elevator cleanup, ioc reference will be dropped which will lead + * to removal of ioscheduler queue as well as associated ioq object. 
+ */ +void elv_exit_fq_data(struct elevator_queue *e) +{ + struct elv_fq_data *efqd = &e->efqd; + struct request_queue *q = efqd->queue; + + if (!elv_iosched_fair_queuing_enabled(e)) + return; + + elv_shutdown_timer_wq(e); + + spin_lock_irq(q->queue_lock); + /* This should drop all the io group references of async queues */ + io_disconnect_groups(e); + spin_unlock_irq(q->queue_lock); + + elv_shutdown_timer_wq(e); + + /* Wait for iog->key accessors to exit their grace periods. */ + synchronize_rcu(); + + BUG_ON(timer_pending(&efqd->idle_slice_timer)); + io_free_root_group(e); +} + +/* + * This is called after the io scheduler has cleaned up its data structures. + * I don't think that this function is required. Right now just keeping it + * because cfq cleans up timer and work queue again after freeing up + * io contexts. To me io scheduler has already been drained out, and all + * the active queues have already been expired so timer and work queue should + * not be activated during cleanup process. + * + * Keeping it here for the time being. Will get rid of it later. + */ +void elv_exit_fq_data_post(struct elevator_queue *e) +{ + struct elv_fq_data *efqd = &e->efqd; + + if (!elv_iosched_fair_queuing_enabled(e)) + return; + + elv_shutdown_timer_wq(e); + BUG_ON(timer_pending(&efqd->idle_slice_timer)); +} + + +static int __init elv_fq_init(void) +{ + if (elv_slab_setup()) + return -ENOMEM; + + /* could be 0 on HZ < 1000 setups */ + + if (!elv_slice_async) + elv_slice_async = 1; + + return 0; +} + +module_init(elv_fq_init); diff --git a/block/elevator-fq.h b/block/elevator-fq.h new file mode 100644 index 0000000..0581e55 --- /dev/null +++ b/block/elevator-fq.h @@ -0,0 +1,769 @@ +/* + * elevator fair queuing Layer. Uses B-WF2Q+ hierarchical scheduler for + * fair queuing. Data structures and common functions prototypes.
+ * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include +#include + +#ifndef _BFQ_SCHED_H +#define _BFQ_SCHED_H + +#define IO_IOPRIO_CLASSES 3 +#define WEIGHT_MAX 1000 + +struct io_entity; +struct io_queue; + +#ifdef CONFIG_ELV_FAIR_QUEUING +#define ELV_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, elv_##name##_show, elv_##name##_store) + +/** + * struct io_service_tree - per ioprio_class service tree. + * @active: tree for active entities (i.e., those backlogged). + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). + * @first_idle: idle entity with minimum F_i. + * @last_idle: idle entity with maximum F_i. + * @vtime: scheduler virtual time. + * @wsum: scheduler weight sum; active and idle entities contribute to it. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * io_service_tree. All the fields are protected by the queue lock + * of the containing efqd. + */ +struct io_service_tree { + struct rb_root active; + struct rb_root idle; + + struct io_entity *first_idle; + struct io_entity *last_idle; + + u64 vtime; + unsigned int wsum; +}; + +/** + * struct io_sched_data - multi-class scheduler. + * @active_entity: entity under service. + * @next_active: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. + * + * io_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_active points to the active entity of the sched_data service + * trees that will be scheduled next. 
+ * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing efqd. + */ +struct io_sched_data { + struct io_entity *active_entity; + struct io_entity *next_active; + int nr_active; + struct io_service_tree service_tree[IO_IOPRIO_CLASSES]; +}; + +/** + * struct io_entity - schedulable entity. + * @rb_node: service_tree member. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. + * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: the weight in use. + * @new_weight: when a weight change is requested, the new weight value + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_ioprio: when an ioprio change is requested, the new ioprio value + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested an ioprio or + * ioprio_class change. 
+ * + * A io_entity is used to represent either a io_queue (leaf node in the + * cgroup hierarchy) or a io_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would allow + * different weights on different devices, but this functionality is not + * exported to userspace by now. Priorities are updated lazily, first + * storing the new values into the new_* fields, then setting the + * @ioprio_changed flag. As soon as there is a transition in the entity + * state that allows the priority update to take place the effective and + * the requested priority values are synchronized. + * + * The weight value is calculated from the ioprio to export the same + * interface as CFQ. + * + * All the fields are protected by the queue lock of the containing efqd. + */ +struct io_entity { + struct rb_node rb_node; + + int on_st; + + u64 finish; + u64 start; + + struct rb_root *tree; + + u64 min_start; + + unsigned long service, budget; + unsigned int weight, new_weight; + + struct io_entity *parent; + + struct io_sched_data *my_sched_data; + struct io_sched_data *sched_data; + + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; + + /* + * Keep track of total service received by this entity. Keep the + * stats both for time slices and number of sectors dispatched + */ + unsigned long total_service; + unsigned long total_sector_service; +}; + +/* + * A common structure representing the io queue where requests are actually + * queued. 
+ */ +struct io_queue { + struct io_entity entity; + atomic_t ref; + unsigned int flags; + + /* Pointer to generic elevator fair queuing data structure */ + struct elv_fq_data *efqd; + pid_t pid; + + /* Number of requests queued on this io queue */ + unsigned long nr_queued; + + /* Requests dispatched from this queue */ + int dispatched; + + /* Number of sectors dispatched in current dispatch round */ + unsigned long nr_sectors; + + unsigned long slice_end; + + /* Keeps track of when queue was scheduled in to dispatch */ + unsigned long slice_start; + + /* Pointer to io scheduler's queue */ + void *sched_queue; +}; + +#ifdef CONFIG_GROUP_IOSCHED +/** + * struct io_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both io_queues and io_groups). + * @group_node: node to be inserted into the io_cgroup->group_data + * list of the containing cgroup's io_cgroup. + * @elv_data_node: node to be inserted into the @efqd->group_list list + * of the groups active on the same device; used for cleanup. + * @async_queue: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_queue: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/migration. + * + * Each (device, cgroup) pair has its own io_group, i.e., for each cgroup + * there is a set of io_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @group_node is protected by the io_cgroup lock, and is accessed + * via RCU from its readers. + * o @efqd is protected by the queue lock, RCU is used to access it + * from the readers. 
+ * o All the other fields are protected by the @efqd queue lock. + */ +struct io_group { + struct io_entity entity; + unsigned int flags; + struct hlist_node elv_data_node; + struct hlist_node group_node; + struct io_sched_data sched_data; + atomic_t ref; + struct io_entity *my_entity; + + /* + * A cgroup has multiple io_groups, one for each request queue. + * to find io group belonging to a particular queue, elv_fq_data + * pointer is stored as a key. + */ + void *key; + + /* + * async queue for each priority case for RT and BE class. + * Used only for cfq. + */ + + struct io_queue *async_queue[2][IOPRIO_BE_NR]; + struct io_queue *async_idle_queue; + struct rcu_head rcu_head; + int deleting; + unsigned short iocg_id; + + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + +#ifdef CONFIG_DEBUG_GROUP_IOSCHED + /* How many times this group has been added to active tree */ + unsigned long queue; + + /* How long this group remained on active tree, in ms */ + unsigned long queue_duration; + + /* When was this group added to active tree */ + unsigned long queue_start; + + /* How many times this group has been removed from active tree */ + unsigned long dequeue; + + /* Store cgroup path */ + char path[128]; +#endif + + /* Single ioq per group, used for noop, deadline, anticipatory */ + struct io_queue *ioq; + + /* io group congestion on and off threshold for request descriptors */ + unsigned int nr_congestion_on; + unsigned int nr_congestion_off; + + /* request list associated with the group */ + struct request_list rl; +}; + +struct io_policy_node { + struct list_head node; + dev_t dev; + unsigned int weight; + unsigned short ioprio_class; +}; + +/** + * struct io_cgroup - io cgroup data structure. + * @css: subsystem state for io in the containing cgroup. + * @weight: cgroup weight. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @weight, @ioprio_class and @group_data. 
+ * @group_data: list containing the io_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @weight and @ioprio_class are protected by @lock. + */ +struct io_cgroup { + struct cgroup_subsys_state css; + + unsigned int weight; + unsigned short ioprio_class; + + /* list of io_policy_node */ + struct list_head policy_list; + + spinlock_t lock; + struct hlist_head group_data; +}; +#else +struct io_group { + struct io_sched_data sched_data; + + /* async_queue and idle_queue are used only for cfq */ + struct io_queue *async_queue[2][IOPRIO_BE_NR]; + struct io_queue *async_idle_queue; +}; +#endif /* CONFIG_GROUP_IOSCHED */ + + +struct elv_fq_data { + struct io_group *root_group; + + /* List of io groups hanging on this elevator */ + struct hlist_head group_list; + + struct request_queue *queue; + unsigned int busy_queues; + + /* Pointer to the ioscheduler queue being served */ + void *active_queue; + + int rq_in_driver; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + unsigned int elv_group_idle; + + /* Base slice length for sync and async queues */ + unsigned int elv_slice[2]; + + /* + * Fallback dummy ioq for extreme OOM conditions + */ + struct io_queue oom_ioq; + + /* + * If set to 1, waits for all request completions from current + * queue before new queue is scheduled in + */ + unsigned int fairness; + + /* + * Get io group bio belongs to from bio and not from submitting task + * context + */ + unsigned int map_sync; +}; + +/* Logging facilities. Statement macros: always invoke with a trailing ';'. */ +#ifdef CONFIG_DEBUG_GROUP_IOSCHED +#define elv_log_ioq(efqd, ioq, fmt, args...) \ +do { \ + blk_add_trace_msg((efqd)->queue, "elv%d%c %s " fmt, (ioq)->pid, \ + elv_ioq_sync(ioq) ? 'S' : 'A', \ + ioq_to_io_group(ioq)->path, ##args); \ +} while (0) + +#define elv_log_iog(efqd, iog, fmt, args...) \ +do { \ + blk_add_trace_msg((efqd)->queue, "elv %s " fmt, (iog)->path, ##args); \ +} while (0) + +#else +#define elv_log_ioq(efqd, ioq, fmt, args...)
\ + blk_add_trace_msg((efqd)->queue, "elv%d%c " fmt, (ioq)->pid, \ + elv_ioq_sync(ioq) ? 'S' : 'A', ##args) + +#define elv_log_iog(efqd, iog, fmt, args...) \ + blk_add_trace_msg((efqd)->queue, "elv " fmt, ##args) + +#endif + +#define elv_log(efqd, fmt, args...) \ + blk_add_trace_msg((efqd)->queue, "elv " fmt, ##args) + +#define ioq_sample_valid(samples) ((samples) > 80) + +/* Some shared queue flag manipulation functions among elevators */ + +enum elv_queue_state_flags { + ELV_QUEUE_FLAG_busy = 0, /* has requests or is under service */ + ELV_QUEUE_FLAG_sync, /* synchronous queue */ + ELV_QUEUE_FLAG_idle_window, /* elevator slice idling enabled */ + ELV_QUEUE_FLAG_wait_request, /* waiting for a request */ + ELV_QUEUE_FLAG_must_dispatch, /* must be allowed a dispatch */ + ELV_QUEUE_FLAG_slice_new, /* no requests dispatched in slice */ + ELV_QUEUE_FLAG_must_expire, /* Expire this queue even if it has + * request and time slice left */ +}; + +#define ELV_IO_QUEUE_FLAG_FNS(name) \ +static inline void elv_mark_ioq_##name(struct io_queue *ioq) \ +{ \ + (ioq)->flags |= (1 << ELV_QUEUE_FLAG_##name); \ +} \ +static inline void elv_clear_ioq_##name(struct io_queue *ioq) \ +{ \ + (ioq)->flags &= ~(1 << ELV_QUEUE_FLAG_##name); \ +} \ +static inline int elv_ioq_##name(struct io_queue *ioq) \ +{ \ + return ((ioq)->flags & (1 << ELV_QUEUE_FLAG_##name)) != 0; \ +} + +ELV_IO_QUEUE_FLAG_FNS(busy) +ELV_IO_QUEUE_FLAG_FNS(sync) +ELV_IO_QUEUE_FLAG_FNS(wait_request) +ELV_IO_QUEUE_FLAG_FNS(must_dispatch) +ELV_IO_QUEUE_FLAG_FNS(idle_window) +ELV_IO_QUEUE_FLAG_FNS(slice_new) +ELV_IO_QUEUE_FLAG_FNS(must_expire) + +#ifdef CONFIG_GROUP_IOSCHED + +enum elv_group_state_flags { + ELV_GROUP_FLAG_idle_window, /* elevator group idling enabled */ + ELV_GROUP_FLAG_wait_request, /* waiting for a request */ + ELV_GROUP_FLAG_wait_busy, /* wait for this queue to get busy */ + ELV_GROUP_FLAG_wait_busy_done, /* Have already waited on this group*/ +}; + +#define ELV_IO_GROUP_FLAG_FNS(name) \ +static inline 
void elv_mark_iog_##name(struct io_group *iog) \ +{ \ + (iog)->flags |= (1 << ELV_GROUP_FLAG_##name); \ +} \ +static inline void elv_clear_iog_##name(struct io_group *iog) \ +{ \ + (iog)->flags &= ~(1 << ELV_GROUP_FLAG_##name); \ +} \ +static inline int elv_iog_##name(struct io_group *iog) \ +{ \ + return ((iog)->flags & (1 << ELV_GROUP_FLAG_##name)) != 0; \ +} + +#else /* GROUP_IOSCHED */ + +#define ELV_IO_GROUP_FLAG_FNS(name) \ +static inline void elv_mark_iog_##name(struct io_group *iog) {} \ +static inline void elv_clear_iog_##name(struct io_group *iog) {} \ +static inline int elv_iog_##name(struct io_group *iog) { return 0; } +#endif /* GROUP_IOSCHED */ + +ELV_IO_GROUP_FLAG_FNS(idle_window) +ELV_IO_GROUP_FLAG_FNS(wait_request) +ELV_IO_GROUP_FLAG_FNS(wait_busy) +ELV_IO_GROUP_FLAG_FNS(wait_busy_done) + +static inline struct io_service_tree * +io_entity_service_tree(struct io_entity *entity) +{ + struct io_sched_data *sched_data = entity->sched_data; + unsigned int idx = entity->ioprio_class - 1; + + BUG_ON(idx >= IO_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + return sched_data->service_tree + idx; +} + +static inline int elv_ioq_slice_used(struct io_queue *ioq) +{ + if (elv_ioq_slice_new(ioq)) + return 0; + if (time_before(jiffies, ioq->slice_end)) + return 0; + + return 1; +} + +/* How many request are currently dispatched from the queue */ +static inline int elv_ioq_nr_dispatched(struct io_queue *ioq) +{ + return ioq->dispatched; +} + +/* How many request are currently queued in the queue */ +static inline int elv_ioq_nr_queued(struct io_queue *ioq) +{ + return ioq->nr_queued; +} + +static inline void elv_get_ioq(struct io_queue *ioq) +{ + atomic_inc(&ioq->ref); +} + +static inline int elv_ioq_class_idle(struct io_queue *ioq) +{ + return ioq->entity.ioprio_class == IOPRIO_CLASS_IDLE; +} + +static inline int elv_ioq_class_rt(struct io_queue *ioq) +{ + return ioq->entity.ioprio_class == IOPRIO_CLASS_RT; +} + +static inline int elv_ioq_ioprio_class(struct 
io_queue *ioq) +{ + return ioq->entity.ioprio_class; +} + +static inline int elv_ioq_ioprio(struct io_queue *ioq) +{ + return ioq->entity.ioprio; +} + +static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq, + int ioprio_class) +{ + ioq->entity.new_ioprio_class = ioprio_class; + ioq->entity.ioprio_changed = 1; +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static inline unsigned int bfq_ioprio_to_weight(int ioprio) +{ + WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return ((IOPRIO_BE_NR - ioprio) * WEIGHT_MAX)/IOPRIO_BE_NR; +} + +static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio) +{ + ioq->entity.new_ioprio = ioprio; + ioq->entity.new_weight = bfq_ioprio_to_weight(ioprio); + ioq->entity.ioprio_changed = 1; +} + +static inline void *ioq_sched_queue(struct io_queue *ioq) +{ + if (ioq) + return ioq->sched_queue; + return NULL; +} + +static inline struct io_group *ioq_to_io_group(struct io_queue *ioq) +{ + return container_of(ioq->entity.sched_data, struct io_group, + sched_data); +} + +#ifdef CONFIG_GROUP_IOSCHED +extern int io_group_allow_merge(struct request *rq, struct bio *bio); +extern void elv_put_iog(struct io_group *iog); +extern ssize_t elv_group_idle_show(struct elevator_queue *q, char *name); +extern ssize_t elv_group_idle_store(struct elevator_queue *q, const char *name, + size_t count); +extern ssize_t elv_map_sync_show(struct elevator_queue *q, char *name); +extern ssize_t elv_map_sync_store(struct elevator_queue *q, const char *name, + size_t count); +static inline void elv_get_iog(struct io_group *iog) +{ + atomic_inc(&iog->ref); +} + +static inline int update_requeue(struct io_queue *ioq, int requeue) +{ + struct io_group *iog = ioq_to_io_group(ioq); + + if (iog->deleting == 1) + return 0; + + return requeue; +} + +extern int elv_fq_set_request_ioq(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask); +extern void 
elv_fq_unset_request_ioq(struct request_queue *q, + struct request *rq); +extern struct io_queue *elv_lookup_ioq_bio(struct request_queue *q, + struct bio *bio); +extern struct request_list *io_group_get_request_list(struct request_queue *q, + struct bio *bio); +extern int elv_io_group_congested(struct request_queue *q, struct page *page, + int sync); + +/* Sets the single ioq associated with the io group. (noop, deadline, AS) */ +static inline void io_group_set_ioq(struct io_group *iog, struct io_queue *ioq) +{ + BUG_ON(!iog); + /* io group reference. Will be dropped when group is destroyed. */ + elv_get_ioq(ioq); + iog->ioq = ioq; +} + +#else /* !GROUP_IOSCHED */ +static inline int io_group_allow_merge(struct request *rq, struct bio *bio) +{ + return 1; +} + +static inline void elv_get_iog(struct io_group *iog) +{ +} + +static inline void elv_put_iog(struct io_group *iog) +{ +} + +static inline int update_requeue(struct io_queue *ioq, int requeue) +{ + return requeue; +} + +static inline void io_group_set_ioq(struct io_group *iog, struct io_queue *ioq) +{ +} + +static inline int elv_fq_set_request_ioq(struct request_queue *q, + struct request *rq, struct bio *bio, gfp_t gfp_mask) +{ + return 0; +} + +static inline void elv_fq_unset_request_ioq(struct request_queue *q, + struct request *rq) +{ +} + +static inline struct io_queue *elv_lookup_ioq_bio(struct request_queue *q, + struct bio *bio) +{ + return NULL; +} + +#endif /* GROUP_IOSCHED */ + +extern ssize_t elv_slice_sync_show(struct elevator_queue *q, char *name); +extern ssize_t elv_slice_sync_store(struct elevator_queue *q, const char *name, + size_t count); +extern ssize_t elv_slice_async_show(struct elevator_queue *q, char *name); +extern ssize_t elv_slice_async_store(struct elevator_queue *q, const char *name, + size_t count); +extern ssize_t elv_fairness_show(struct elevator_queue *q, char *name); +extern ssize_t elv_fairness_store(struct elevator_queue *q, const char *name, + size_t count); +/* Functions 
used by elevator.c */ +extern int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e); +extern void elv_exit_fq_data(struct elevator_queue *e); +extern void elv_exit_fq_data_post(struct elevator_queue *e); + +extern void elv_ioq_request_add(struct request_queue *q, struct request *rq); +extern void elv_ioq_request_removed(struct elevator_queue *e, + struct request *rq); +extern void elv_fq_dispatched_request(struct elevator_queue *e, + struct request *rq); + +extern void elv_fq_activate_rq(struct request_queue *q, struct request *rq); +extern void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq); + +extern void elv_ioq_completed_request(struct request_queue *q, + struct request *rq); + +extern void *elv_fq_select_ioq(struct request_queue *q, int force); + +/* Functions used by io schedulers */ +extern void elv_put_ioq(struct io_queue *ioq); +extern void __elv_ioq_slice_expired(struct request_queue *q, + struct io_queue *ioq); +extern int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq, + pid_t pid, int is_sync); +extern void elv_init_ioq_io_group(struct elevator_queue *eq, + struct io_queue *ioq, void *iog); +extern void elv_init_ioq_sched_queue(struct elevator_queue *eq, + struct io_queue *ioq, void *sched_queue); +extern void elv_init_ioq_prio_data(struct elevator_queue *eq, + struct io_queue *ioq, int ioprio_class, int ioprio); +extern struct io_queue *elv_get_oom_ioq(struct elevator_queue *eq); +extern void elv_schedule_dispatch(struct request_queue *q); +extern void *elv_active_sched_queue(struct elevator_queue *e); +extern int elv_mod_idle_slice_timer(struct elevator_queue *eq, + unsigned long expires); +extern int elv_del_idle_slice_timer(struct elevator_queue *eq); +extern void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class, + int ioprio); +extern void io_group_set_async_queue(struct io_group *iog, int ioprio_class, + int ioprio, struct io_queue *ioq); +extern struct io_group 
*io_get_io_group(struct request_queue *q, + struct page *page, int create); +extern struct io_group *io_get_io_group_bio(struct request_queue *q, + struct bio *bio, int create); +extern int elv_nr_busy_ioq(struct elevator_queue *e); +extern int elv_rq_in_driver(struct elevator_queue *e); +extern struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask); +extern void elv_free_ioq(struct io_queue *ioq); + +#else /* CONFIG_ELV_FAIR_QUEUING */ + +static inline int elv_init_fq_data(struct request_queue *q, + struct elevator_queue *e) +{ + return 0; +} + +static inline void elv_exit_fq_data(struct elevator_queue *e) {} +static inline void elv_exit_fq_data_post(struct elevator_queue *e) {} + +static inline void elv_fq_activate_rq(struct request_queue *q, + struct request *rq) +{ +} + +static inline void elv_fq_deactivate_rq(struct request_queue *q, + struct request *rq) +{ +} + +static inline void elv_fq_dispatched_request(struct elevator_queue *e, + struct request *rq) +{ +} + +static inline void elv_ioq_request_removed(struct elevator_queue *e, + struct request *rq) +{ +} + +static inline void elv_ioq_request_add(struct request_queue *q, + struct request *rq) +{ +} + +static inline void elv_ioq_completed_request(struct request_queue *q, + struct request *rq) +{ +} + +static inline void *ioq_sched_queue(struct io_queue *ioq) { return NULL; } +static inline void *elv_fq_select_ioq(struct request_queue *q, int force) +{ + return NULL; +} + +static inline int io_group_allow_merge(struct request *rq, struct bio *bio) + +{ + return 1; +} +static inline int elv_fq_set_request_ioq(struct request_queue *q, + struct request *rq, struct bio *bio, gfp_t gfp_mask) +{ + return 0; +} + +static inline void elv_fq_unset_request_ioq(struct request_queue *q, + struct request *rq) +{ +} + +static inline struct io_queue *elv_lookup_ioq_bio(struct request_queue *q, + struct bio *bio) +{ + return NULL; +} + +#endif /* CONFIG_ELV_FAIR_QUEUING */ +#endif /* _BFQ_SCHED_H */ diff 
--git a/block/elevator.c b/block/elevator.c index 2d511f9..285719c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -113,6 +113,10 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) !bio_failfast_driver(bio) != !blk_failfast_driver(rq)) return 0; + /* If rq and bio belong to different groups, don't allow merging */ + if (!io_group_allow_merge(rq, bio)) + return 0; + if (!elv_iosched_allow_merge(rq, bio)) return 0; @@ -184,17 +188,62 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static void *elevator_init_queue(struct request_queue *q, - struct elevator_queue *eq) +static void * +elevator_init_data(struct request_queue *q, struct elevator_queue *eq) +{ + void *data = NULL; + + if (eq->ops->elevator_init_fn) { + data = eq->ops->elevator_init_fn(q, eq); + if (data) + return data; + else + return ERR_PTR(-ENOMEM); + } + + /* IO scheduler does not instantiate data (noop), it is not an error */ + return NULL; +} + +static void +elevator_free_sched_queue(struct elevator_queue *eq, void *sched_queue) +{ + /* Not all io schedulers (cfq) store sched_queue */ + if (!sched_queue) + return; + eq->ops->elevator_free_sched_queue_fn(eq, sched_queue); +} + +static void * +elevator_alloc_sched_queue(struct request_queue *q, struct elevator_queue *eq) { - return eq->ops->elevator_init_fn(q); + void *sched_queue = NULL; + + /* + * If fair queuing is enabled, then queue allocation takes place + * during set_request() functions when request actually comes + * in. 
+ */ + if (elv_iosched_fair_queuing_enabled(eq)) + return NULL; + + if (eq->ops->elevator_alloc_sched_queue_fn) { + sched_queue = eq->ops->elevator_alloc_sched_queue_fn(q, eq, + GFP_KERNEL, NULL); + if (!sched_queue) + return ERR_PTR(-ENOMEM); + + } + + return sched_queue; } static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, - void *data) + void *data, void *sched_queue) { q->elevator = eq; eq->elevator_data = data; + eq->sched_queue = sched_queue; } static char chosen_elevator[16]; @@ -239,6 +288,9 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, for (i = 0; i < ELV_HASH_ENTRIES; i++) INIT_HLIST_HEAD(&eq->hash[i]); + if (elv_init_fq_data(q, eq)) + goto err; + return eq; err: kfree(eq); @@ -261,7 +313,7 @@ int elevator_init(struct request_queue *q, char *name) struct elevator_type *e = NULL; struct elevator_queue *eq; int ret = 0; - void *data; + void *data = NULL, *sched_queue = NULL; INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; @@ -295,13 +347,21 @@ int elevator_init(struct request_queue *q, char *name) if (!eq) return -ENOMEM; - data = elevator_init_queue(q, eq); - if (!data) { + data = elevator_init_data(q, eq); + + if (IS_ERR(data)) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + + sched_queue = elevator_alloc_sched_queue(q, eq); + + if (IS_ERR(sched_queue)) { kobject_put(&eq->kobj); return -ENOMEM; } - elevator_attach(q, eq, data); + elevator_attach(q, eq, data, sched_queue); return ret; } EXPORT_SYMBOL(elevator_init); @@ -309,9 +369,12 @@ EXPORT_SYMBOL(elevator_init); void elevator_exit(struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); + elevator_free_sched_queue(e, e->sched_queue); + elv_exit_fq_data(e); if (e->ops->elevator_exit_fn) e->ops->elevator_exit_fn(e); e->ops = NULL; + elv_exit_fq_data_post(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -438,6 +501,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) elv_rqhash_del(q, rq); q->nr_sorted--; + 
elv_fq_dispatched_request(q->elevator, rq); boundary = q->end_sector; stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; @@ -478,6 +542,7 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) elv_rqhash_del(q, rq); q->nr_sorted--; + elv_fq_dispatched_request(q->elevator, rq); q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; @@ -545,6 +610,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, elv_rqhash_del(q, next); q->nr_sorted--; + elv_ioq_request_removed(e, next); q->last_merge = rq; } @@ -593,7 +659,7 @@ void elv_quiesce_start(struct request_queue *q) * make sure we don't have any requests in flight */ elv_drain_elevator(q); - while (q->rq.elvpriv) { + while (q->rq_data.elvpriv) { __blk_run_queue(q); spin_unlock_irq(q->queue_lock); msleep(10); @@ -651,12 +717,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->last_merge = rq; } - /* - * Some ioscheds (cfq) run q->request_fn directly, so - * rq cannot be accessed after calling - * elevator_add_req_fn. 
- */ q->elevator->ops->elevator_add_req_fn(q, rq); + elv_ioq_request_add(q, rq); break; case ELEVATOR_INSERT_REQUEUE: @@ -696,8 +758,9 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) } if (unplug_it && blk_queue_plugged(q)) { - int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - - queue_in_flight(q); + int nrq = q->rq_data.count[BLK_RW_SYNC] + + q->rq_data.count[BLK_RW_ASYNC] - + queue_in_flight(q); if (nrq >= q->unplug_thresh) __generic_unplug_device(q); @@ -755,13 +818,12 @@ EXPORT_SYMBOL(elv_add_request); int elv_queue_empty(struct request_queue *q) { - struct elevator_queue *e = q->elevator; - if (!list_empty(&q->queue_head)) return 0; - if (e->ops->elevator_queue_empty_fn) - return e->ops->elevator_queue_empty_fn(q); + /* Hopefully nr_sorted works and no need to call queue_empty_fn */ + if (q->nr_sorted) + return 0; return 1; } @@ -785,12 +847,20 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) +int elv_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) { struct elevator_queue *e = q->elevator; + /* + * Optimization for noop, deadline and AS which maintain only single + * ioq per io group + */ + if (elv_iosched_single_ioq(e)) + return elv_fq_set_request_ioq(q, rq, bio, gfp_mask); + if (e->ops->elevator_set_req_fn) - return e->ops->elevator_set_req_fn(q, rq, gfp_mask); + return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); rq->elevator_private = NULL; return 0; @@ -800,6 +870,15 @@ void elv_put_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + /* + * Optimization for noop, deadline and AS which maintain only single + * ioq per io group + */ + if (elv_iosched_single_ioq(e)) { + elv_fq_unset_request_ioq(q, rq); + return; + } + if (e->ops->elevator_put_req_fn) e->ops->elevator_put_req_fn(rq); } @@ -841,8 
+920,11 @@ void elv_completed_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) - e->ops->elevator_completed_req_fn(q, rq); + if (blk_sorted_rq(rq)) { + if (e->ops->elevator_completed_req_fn) + e->ops->elevator_completed_req_fn(q, rq); + elv_ioq_completed_request(q, rq); + } } /* @@ -995,7 +1077,7 @@ EXPORT_SYMBOL_GPL(elv_unregister); static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old_elevator, *e; - void *data; + void *data = NULL, *sched_queue = NULL; /* * Allocate new elevator @@ -1004,10 +1086,18 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (!e) return 0; - data = elevator_init_queue(q, e); - if (!data) { + data = elevator_init_data(q, e); + + if (IS_ERR(data)) { kobject_put(&e->kobj); return 0; + } + + sched_queue = elevator_alloc_sched_queue(q, e); + + if (IS_ERR(sched_queue)) { + kobject_put(&e->kobj); + return 0; } /* @@ -1024,7 +1114,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* * attach and start new elevator */ - elevator_attach(q, e, data); + elevator_attach(q, e, data, sched_queue); spin_unlock_irq(q->queue_lock); @@ -1138,3 +1228,53 @@ struct request *elv_rb_latter_request(struct request_queue *q, return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); + +/* Get the io scheduler queue pointer. */ +void *elv_get_sched_queue(struct request_queue *q, struct request *rq) +{ + /* + * io scheduler is not using fair queuing. Return sched_queue + * pointer stored in elevator_queue. It will be null if io + * scheduler never stored anything there to begin with (cfq) + */ + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return q->elevator->sched_queue; + + /* + * IO scheduler is using fair queuing infrastructure. 
If io scheduler + * has passed a non null rq, retrieve sched_queue pointer from + * there. */ + if (rq) + return ioq_sched_queue(req_ioq(rq)); + + return NULL; +} +EXPORT_SYMBOL(elv_get_sched_queue); + +/* Select an ioscheduler queue to dispatch request from. */ +void *elv_select_sched_queue(struct request_queue *q, int force) +{ + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return q->elevator->sched_queue; + + return ioq_sched_queue(elv_fq_select_ioq(q, force)); +} +EXPORT_SYMBOL(elv_select_sched_queue); + +/* + * Get the io scheduler queue pointer for the group bio belongs to. + * + * If fair queuing is enabled, determine the io group of task and retrieve + * the ioq pointer from that. This is used by only single queue ioschedulers + * for retrieving the queue associated with the group to decide whether the + * new bio can do a front merge or not. + */ +void *elv_get_sched_queue_bio(struct request_queue *q, struct bio *bio) +{ + /* Fair queuing is not enabled. There is only one queue. 
*/ + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return q->elevator->sched_queue; + + return ioq_sched_queue(elv_lookup_ioq_bio(q, bio)); +} +EXPORT_SYMBOL(elv_get_sched_queue_bio); diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3a0d369..190f37b 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -7,7 +7,7 @@ #include <linux/module.h> #include <linux/init.h> -struct noop_data { +struct noop_queue { struct list_head queue; }; @@ -19,11 +19,14 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq, static int noop_dispatch(struct request_queue *q, int force) { - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_select_sched_queue(q, force); - if (!list_empty(&nd->queue)) { + if (!nq) + return 0; + + if (!list_empty(&nq->queue)) { struct request *rq; - rq = list_entry(nd->queue.next, struct request, queuelist); + rq = list_entry(nq->queue.next, struct request, queuelist); list_del_init(&rq->queuelist); elv_dispatch_sort(q, rq); return 1; @@ -33,24 +36,17 @@ static int noop_dispatch(struct request_queue *q, int force) static void noop_add_request(struct request_queue *q, struct request *rq) { - struct noop_data *nd = q->elevator->elevator_data; - - list_add_tail(&rq->queuelist, &nd->queue); -} - -static int noop_queue_empty(struct request_queue *q) -{ - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_get_sched_queue(q, rq); - return list_empty(&nd->queue); + list_add_tail(&rq->queuelist, &nq->queue); } static struct request * noop_former_request(struct request_queue *q, struct request *rq) { - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_get_sched_queue(q, rq); - if (rq->queuelist.prev == &nd->queue) + if (rq->queuelist.prev == &nq->queue) return NULL; return list_entry(rq->queuelist.prev, struct request, queuelist); } @@ -58,43 +54,58 @@ noop_former_request(struct request_queue *q, struct request *rq) static struct request * 
noop_latter_request(struct request_queue *q, struct request *rq) { - struct noop_data *nd = q->elevator->elevator_data; + struct noop_queue *nq = elv_get_sched_queue(q, rq); - if (rq->queuelist.next == &nd->queue) + if (rq->queuelist.next == &nq->queue) return NULL; return list_entry(rq->queuelist.next, struct request, queuelist); } -static void *noop_init_queue(struct request_queue *q) +static void *noop_alloc_noop_queue(struct request_queue *q, + struct elevator_queue *eq, gfp_t gfp_mask, struct io_queue *ioq) { - struct noop_data *nd; + struct noop_queue *nq; - nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); - if (!nd) - return NULL; - INIT_LIST_HEAD(&nd->queue); - return nd; + nq = kmalloc_node(sizeof(*nq), gfp_mask | __GFP_ZERO, q->node); + if (nq == NULL) + goto out; + + INIT_LIST_HEAD(&nq->queue); +out: + return nq; } -static void noop_exit_queue(struct elevator_queue *e) +static void noop_free_noop_queue(struct elevator_queue *e, void *sched_queue) { - struct noop_data *nd = e->elevator_data; + struct noop_queue *nq = sched_queue; - BUG_ON(!list_empty(&nd->queue)); - kfree(nd); + kfree(nq); } +#ifdef CONFIG_IOSCHED_NOOP_HIER +static struct elv_fs_entry noop_attrs[] = { + ELV_ATTR(fairness), + ELV_ATTR(slice_sync), + ELV_ATTR(group_idle), + ELV_ATTR(map_sync), + __ATTR_NULL +}; +#endif + static struct elevator_type elevator_noop = { .ops = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, - .elevator_queue_empty_fn = noop_queue_empty, .elevator_former_req_fn = noop_former_request, .elevator_latter_req_fn = noop_latter_request, - .elevator_init_fn = noop_init_queue, - .elevator_exit_fn = noop_exit_queue, + .elevator_alloc_sched_queue_fn = noop_alloc_noop_queue, + .elevator_free_sched_queue_fn = noop_free_noop_queue, }, +#ifdef CONFIG_IOSCHED_NOOP_HIER + .elevator_features = ELV_IOSCHED_NEED_FQ | ELV_IOSCHED_SINGLE_IOQ, + .elevator_attrs = noop_attrs, +#endif .elevator_name = 
"noop", .elevator_owner = THIS_MODULE, }; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2cba557..2dc0e4f 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1175,7 +1175,8 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } -int dm_table_any_congested(struct dm_table *t, int bdi_bits) +int dm_table_any_congested(struct dm_table *t, int bdi_bits, struct page *page, + int group) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); @@ -1185,9 +1186,11 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); char b[BDEVNAME_SIZE]; - if (likely(q)) - r |= bdi_congested(&q->backing_dev_info, bdi_bits); - else + if (likely(q)) { + struct backing_dev_info *bdi = &q->backing_dev_info; + r |= group ? bdi_congested_group(bdi, bdi_bits, page) + : bdi_congested(bdi, bdi_bits); + } else DMWARN_LIMIT("%s: any_congested: nonexistent device %s", dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9acd54a..f5d490b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1608,7 +1608,8 @@ static void dm_unplug_all(struct request_queue *q) } } -static int dm_any_congested(void *congested_data, int bdi_bits) +static int dm_any_congested(void *congested_data, int bdi_bits, + struct page *page, int group) { int r = bdi_bits; struct mapped_device *md = congested_data; @@ -1625,8 +1626,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits) r = md->queue->backing_dev_info.state & bdi_bits; else - r = dm_table_any_congested(map, bdi_bits); - + r = dm_table_any_congested(map, bdi_bits, page, + group); dm_table_put(map); } } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 23278ae..9c4c5a5 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -57,7 +57,8 @@ struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void 
dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); -int dm_table_any_congested(struct dm_table *t, int bdi_bits); +int dm_table_any_congested(struct dm_table *t, int bdi_bits, struct page *page, + int group); int dm_table_any_busy_target(struct dm_table *t); int dm_table_set_type(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5810fa9..ec3acc2 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -102,7 +102,7 @@ static void linear_unplug(struct request_queue *q) rcu_read_unlock(); } -static int linear_congested(void *data, int bits) +static int linear_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; linear_conf_t *conf; @@ -113,7 +113,10 @@ static int linear_congested(void *data, int bits) for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + struct backing_dev_info *bdi = &q->backing_dev_info; + + ret |= group ? 
bdi_congested_group(bdi, bits, page) : + bdi_congested(bdi, bits); } rcu_read_unlock(); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 237fe3f..ab96712 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -192,7 +192,8 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) seq_printf (seq, "]"); } -static int multipath_congested(void *data, int bits) +static int multipath_congested(void *data, int bits, struct page *page, + int group) { mddev_t *mddev = data; multipath_conf_t *conf = mddev->private; @@ -203,8 +204,10 @@ static int multipath_congested(void *data, int bits) mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? bdi_congested_group(bdi, bits, page) + : bdi_congested(bdi, bits); /* Just like multipath_map, we just check the * first available device */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 335f490..b50c11b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -37,7 +37,7 @@ static void raid0_unplug(struct request_queue *q) } } -static int raid0_congested(void *data, int bits) +static int raid0_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; raid0_conf_t *conf = mddev->private; @@ -46,8 +46,10 @@ static int raid0_congested(void *data, int bits) for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? 
bdi_congested_group(bdi, bits, page) + : bdi_congested(bdi, bits); } return ret; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0569efb..3f30375 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -570,7 +570,7 @@ static void raid1_unplug(struct request_queue *q) md_wakeup_thread(mddev->thread); } -static int raid1_congested(void *data, int bits) +static int raid1_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; conf_t *conf = mddev->private; @@ -581,14 +581,17 @@ static int raid1_congested(void *data, int bits) mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; /* Note the '|| 1' - when read_balance prefers * non-congested targets, it can be removed */ if ((bits & (1<<BDI_async_congested)) || 1) - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? bdi_congested_group(bdi, bits, + page) : bdi_congested(bdi, bits); else - ret &= bdi_congested(&q->backing_dev_info, bits); + ret &= group ? bdi_congested_group(bdi, bits, + page) : bdi_congested(bdi, bits); } } rcu_read_unlock(); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7298a5e..895f5fb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -625,7 +625,7 @@ static void raid10_unplug(struct request_queue *q) md_wakeup_thread(mddev->thread); } -static int raid10_congested(void *data, int bits) +static int raid10_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; conf_t *conf = mddev->private; @@ -636,8 +636,10 @@ static int raid10_congested(void *data, int bits) mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); + struct backing_dev_info *bdi = &q->backing_dev_info; - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= group ? 
bdi_congested_group(bdi, bits, page) + : bdi_congested(bdi, bits); } } rcu_read_unlock(); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3783553..a19937c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3323,7 +3323,7 @@ static void raid5_unplug_device(struct request_queue *q) unplug_slaves(mddev); } -static int raid5_congested(void *data, int bits) +static int raid5_congested(void *data, int bits, struct page *page, int group) { mddev_t *mddev = data; raid5_conf_t *conf = mddev->private; diff --git a/fs/afs/write.c b/fs/afs/write.c index c2e7a7f..aa8b359 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -455,7 +455,7 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) } wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) + if (wbc->nonblocking && bdi_or_group_write_congested(bdi, page)) wbc->encountered_congestion = 1; _leave(" = 0"); @@ -491,6 +491,12 @@ static int afs_writepages_region(struct address_space *mapping, return 0; } + if (wbc->nonblocking && bdi_write_congested_group(bdi, page)) { + wbc->encountered_congestion = 1; + page_cache_release(page); + break; + } + /* at this point we hold neither mapping->tree_lock nor lock on * the page itself: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled back from diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d28d29c..cd7cf6c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1249,7 +1249,8 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, return root; } -static int btrfs_congested_fn(void *congested_data, int bdi_bits) +static int btrfs_congested_fn(void *congested_data, int bdi_bits, + struct page *page, int group) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; int ret = 0; @@ -1260,7 +1261,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); - 
if (bdi && bdi_congested(bdi, bdi_bits)) { + if (bdi && (group ? bdi_congested_group(bdi, bdi_bits, page) : + bdi_congested(bdi, bdi_bits))) { ret = 1; break; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6826018..fd7d53f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2368,6 +2368,18 @@ retry: unsigned i; scanned = 1; + + /* + * If the io group page will go into is congested, bail out. + */ + if (wbc->nonblocking + && bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab80e9..7ab5dea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -165,6 +165,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; + struct page *page; bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; @@ -276,8 +277,11 @@ loop_lock: * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && batch_run > 32 && - fs_info->fs_devices->open_devices > 1) { + if (pending) + page = bio_iovec_idx(pending, 0)->bv_page; + + if (pending && bdi_or_group_write_congested(bdi, page) && + num_run > 32 && fs_info->fs_devices->open_devices > 1) { struct io_context *ioc; ioc = current->io_context; diff --git a/fs/buffer.c b/fs/buffer.c index a3ef091..cb68608 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -668,6 +669,7 @@ static void __set_page_dirty(struct page *page, if (page->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); + blkio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c34b7f8..33d0339 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1470,6 +1470,17 @@ retry: n_iov = 0; bytes_to_write = 0; + /* + * If the io group page will go into is congested, bail out. + */ + if (wbc->nonblocking && + bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + for (i = 0; i < nr_pages; i++) { page = pvec.pages[i]; /* diff --git a/fs/direct-io.c b/fs/direct-io.c index 8b10b87..185ba0a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -797,6 +798,7 @@ static int do_direct_IO(struct dio *dio) ret = PTR_ERR(page); goto out; } + blkio_cgroup_reset_owner(page, current->mm); while (block_in_page < blocks_per_page) { unsigned offset_in_page = block_in_page << blkbits; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 15387c9..090a961 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -179,7 +179,7 @@ static void ext2_preread_inode(struct inode *inode) struct backing_dev_info *bdi; bdi = inode->i_mapping->backing_dev_info; - if (bdi_read_congested(bdi)) + if (bdi_or_group_read_congested(bdi, NULL)) return; if (bdi_write_congested(bdi)) return; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 03ebb43..5b9c93b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -371,6 +371,18 @@ retry: PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { scanned = 1; + + /* + * If io group page belongs to is congested. bail out. 
+ */ + if (wbc->nonblocking + && bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); if (ret) done = 1; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 9e3fe17..aa29612 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -266,8 +266,9 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) { struct bio *bio = wi->bio; int err; + struct page *page = bio_iovec_idx(bio, 0)->bv_page; - if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { + if (wi->nbio > 0 && bdi_or_group_write_congested(wi->bdi, page)) { wait_for_completion(&wi->bio_event); wi->nbio--; if (unlikely(atomic_read(&wi->err))) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7ec89fc..2a515ab 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -891,7 +891,7 @@ xfs_convert_page( bdi = inode->i_mapping->backing_dev_info; wbc->nr_to_write--; - if (bdi_write_congested(bdi)) { + if (bdi_or_group_write_congested(bdi, page)) { wbc->encountered_congestion = 1; done = 1; } else if (wbc->nr_to_write <= 0) { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 0c93c7e..74d8776 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -714,7 +714,7 @@ xfs_buf_readahead( struct backing_dev_info *bdi; bdi = target->bt_mapping->backing_dev_info; - if (bdi_read_congested(bdi)) + if (bdi_or_group_read_congested(bdi, NULL)) return; flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1d52425..d7916f3 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,7 @@ enum bdi_state { BDI_unused, /* Available bits start here */ }; -typedef int (congested_fn)(void *, int); +typedef int (congested_fn)(void *, int, struct page *, int); enum 
bdi_stat_item { BDI_RECLAIMABLE, @@ -209,7 +209,7 @@ int writeback_in_progress(struct backing_dev_info *bdi); static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); + return bdi->congested_fn(bdi->congested_data, bdi_bits, NULL, 0); return (bdi->state & bdi_bits); } @@ -229,6 +229,63 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << BDI_async_congested)); } +#ifdef CONFIG_GROUP_IOSCHED +extern int bdi_congested_group(struct backing_dev_info *bdi, int bdi_bits, + struct page *page); + +extern int bdi_read_congested_group(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_or_group_read_congested(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_write_congested_group(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_or_group_write_congested(struct backing_dev_info *bdi, + struct page *page); + +extern int bdi_rw_congested_group(struct backing_dev_info *bdi, + struct page *page); +#else /* CONFIG_GROUP_IOSCHED */ +static inline int bdi_congested_group(struct backing_dev_info *bdi, + int bdi_bits, struct page *page) +{ + return bdi_congested(bdi, bdi_bits); +} + +static inline int bdi_read_congested_group(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_read_congested(bdi); +} + +static inline int bdi_or_group_read_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_read_congested(bdi); +} + +static inline int bdi_write_congested_group(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_write_congested(bdi); +} + +static inline int bdi_or_group_write_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_write_congested(bdi); +} + +static inline int bdi_rw_congested_group(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_rw_congested(bdi); +} + +#endif /* CONFIG_GROUP_IOSCHED */ + enum { 
BLK_RW_ASYNC = 0, BLK_RW_SYNC = 1, diff --git a/include/linux/biotrack.h b/include/linux/biotrack.h new file mode 100644 index 0000000..0b4491a --- /dev/null +++ b/include/linux/biotrack.h @@ -0,0 +1,103 @@ +#include +#include +#include + +#ifndef _LINUX_BIOTRACK_H +#define _LINUX_BIOTRACK_H + +#ifdef CONFIG_CGROUP_BLKIO + +struct io_context; +struct block_device; + +struct blkio_cgroup { + struct cgroup_subsys_state css; + struct io_context *io_context; /* default io_context */ +/* struct radix_tree_root io_context_root; per device io_context */ +}; + +/** + * __init_blkio_page_cgroup() - initialize a blkio_page_cgroup + * @pc: page_cgroup of the page + * + * Reset the owner ID of a page. + */ +static inline void __init_blkio_page_cgroup(struct page_cgroup *pc) +{ + lock_page_cgroup(pc); + page_cgroup_set_id(pc, 0); + unlock_page_cgroup(pc); +} + +/** + * blkio_cgroup_disabled() - check whether blkio_cgroup is disabled + * + * Returns true if disabled, false if not. + */ +static inline bool blkio_cgroup_disabled(void) +{ + if (blkio_cgroup_subsys.disabled) + return true; + return false; +} + +extern void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm); +extern void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm); +extern void blkio_cgroup_reset_owner_pagedirty(struct page *page, + struct mm_struct *mm); +extern void blkio_cgroup_copy_owner(struct page *page, struct page *opage); + +extern struct io_context *get_blkio_cgroup_iocontext(struct bio *bio); +extern unsigned long get_blkio_cgroup_id(struct bio *bio); +extern unsigned long get_blkio_cgroup_id_page(struct page *page); +extern struct cgroup *blkio_cgroup_lookup(int id); + +#else /* CONFIG_CGROUP_BLKIO */ + +struct blkio_cgroup; + +static inline void __init_blkio_page_cgroup(struct page_cgroup *pc) +{ +} + +static inline bool blkio_cgroup_disabled(void) +{ + return true; +} + +static inline void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm) +{ +} + +static
inline void blkio_cgroup_reset_owner(struct page *page, + struct mm_struct *mm) +{ +} + +static inline void blkio_cgroup_reset_owner_pagedirty(struct page *page, + struct mm_struct *mm) +{ +} + +static inline void blkio_cgroup_copy_owner(struct page *page, struct page *opage) +{ +} + +static inline struct io_context *get_blkio_cgroup_iocontext(struct bio *bio) +{ + return NULL; +} + +static inline unsigned long get_blkio_cgroup_id(struct bio *bio) +{ + return 0; +} + +static inline unsigned long get_blkio_cgroup_id_page(struct page *page) +{ + return 0; +} + +#endif /* CONFIG_CGROUP_BLKIO */ + +#endif /* _LINUX_BIOTRACK_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e7cb5db..8b4370a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -32,21 +32,51 @@ struct request; struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 + +#ifdef CONFIG_GROUP_IOSCHED +#define BLKDEV_MAX_RQ 512 /* Default maximum for queue */ +#define BLKDEV_MAX_GROUP_RQ 128 /* Default maximum per group */ +#else #define BLKDEV_MAX_RQ 128 /* Default maximum */ +/* + * This is equivalent to the case of only one group present (root group). Let + * it consume all the request descriptors available on the queue. + */ +#define BLKDEV_MAX_GROUP_RQ BLKDEV_MAX_RQ /* Default maximum */ +#endif struct request; typedef void (rq_end_io_fn)(struct request *, int); struct request_list { /* - * count[], starved[], and wait[] are indexed by + * count[], starved and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC */ int count[2]; int starved[2]; + wait_queue_head_t wait[2]; +}; + +/* + * This data structure keeps track of the mempool of requests for the queue + * and some overall statistics. + */ +struct request_data { + /* + * Per queue request descriptor count. This is in addition to per + * cgroup count + */ + int count[2]; int elvpriv; mempool_t *rq_pool; - wait_queue_head_t wait[2]; + int starved; + /* + * Global list for starved tasks.
A task will be queued here if + * it could not allocate request descriptor and the associated + * group request list does not have any requests pending. + */ + wait_queue_head_t starved_wait; }; /* @@ -229,6 +259,11 @@ struct request { /* for bidi */ struct request *next_rq; + +#ifdef CONFIG_ELV_FAIR_QUEUING + /* io queue request belongs to */ + struct io_queue *ioq; +#endif }; static inline unsigned short req_get_ioprio(struct request *req) @@ -236,6 +271,15 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } +static inline struct io_queue *req_ioq(struct request *req) +{ +#ifdef CONFIG_ELV_FAIR_QUEUING + return req->ioq; +#else + return NULL; +#endif +} + /* * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME * requests. Some step values could eventually be made generic. @@ -330,6 +374,9 @@ struct request_queue */ struct request_list rq; + /* Contains request pool and other data like starved data */ + struct request_data rq_data; + request_fn_proc *request_fn; make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; @@ -391,6 +438,8 @@ struct request_queue * queue settings */ unsigned long nr_requests; /* Max # of requests */ + /* Max # of per io group requests */ + unsigned long nr_group_requests; unsigned int nr_congestion_on; unsigned int nr_congestion_off; unsigned int nr_batching; @@ -770,6 +819,54 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); +extern void blk_init_request_list(struct request_list *rl); + +static inline struct request_list *blk_get_request_list(struct request_queue *q, + struct bio *bio) +{ +#ifdef CONFIG_GROUP_IOSCHED + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return &q->rq; + + return io_group_get_request_list(q, bio); +#else + return &q->rq; +#endif +} + +static inline struct request_list *rq_rl(struct request_queue *q, 
+ struct request *rq) +{ +#ifdef CONFIG_GROUP_IOSCHED + struct io_group *iog; + int priv = rq->cmd_flags & REQ_ELVPRIV; + + if (!elv_iosched_fair_queuing_enabled(q->elevator)) + return &q->rq; + + BUG_ON(priv && !rq->ioq); + + if (priv) + iog = ioq_to_io_group(rq->ioq); + else + iog = q->elevator->efqd.root_group; + + BUG_ON(!iog); + return &iog->rl; +#else + return &q->rq; +#endif +} + +static inline struct io_group *rl_iog(struct request_list *rl) +{ +#ifdef CONFIG_GROUP_IOSCHED + return container_of(rl, struct io_group, rl); +#else + return NULL; +#endif +} + /* * A queue has just exitted congestion. Note this in the global counter of * congested queues, and wake up anyone who was waiting for requests to be @@ -789,6 +886,11 @@ static inline void blk_set_queue_congested(struct request_queue *q, int sync) set_bdi_congested(&q->backing_dev_info, sync); } +#ifdef CONFIG_GROUP_IOSCHED +extern int blk_queue_io_group_congested(struct backing_dev_info *bdi, + int bdi_bits, struct page *page); +#endif + extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31b..78504f3 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -43,6 +43,12 @@ SUBSYS(mem_cgroup) /* */ +#ifdef CONFIG_CGROUP_BLKIO +SUBSYS(blkio_cgroup) +#endif + +/* */ + #ifdef CONFIG_CGROUP_DEVICE SUBSYS(devices) #endif @@ -60,3 +66,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_GROUP_IOSCHED +SUBSYS(io) +#endif + +/* */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1cb3372..e98c098 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -2,6 +2,7 @@ #define _LINUX_ELEVATOR_H #include +#include "../../block/elevator-fq.h" #ifdef CONFIG_BLOCK @@ -22,13 +23,28 @@ typedef struct request *(elevator_request_list_fn) (struct request_queue *, stru typedef 
void (elevator_completed_req_fn) (struct request_queue *, struct request *); typedef int (elevator_may_queue_fn) (struct request_queue *, int); -typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); +typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, + struct bio *bio, gfp_t); typedef void (elevator_put_req_fn) (struct request *); typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); -typedef void *(elevator_init_fn) (struct request_queue *); +typedef void *(elevator_init_fn) (struct request_queue *, + struct elevator_queue *); typedef void (elevator_exit_fn) (struct elevator_queue *); +typedef void (elevator_free_sched_queue_fn) (struct elevator_queue*, void *); +typedef void* (elevator_alloc_sched_queue_fn) (struct request_queue *q, + struct elevator_queue *eq, gfp_t, struct io_queue *ioq); +#ifdef CONFIG_ELV_FAIR_QUEUING +typedef void (elevator_active_ioq_set_fn) (struct request_queue*, void *, int); +typedef void (elevator_active_ioq_reset_fn) (struct request_queue *, void*); +typedef void (elevator_arm_slice_timer_fn) (struct request_queue*, void*); +typedef int (elevator_should_preempt_fn) (struct request_queue*, void*, + struct request*); +typedef struct io_queue* (elevator_close_cooperator_fn) (struct request_queue*, + void*); +typedef int (elevator_expire_ioq_fn) (struct request_queue*, void *, int, int); +#endif struct elevator_ops { @@ -56,6 +72,18 @@ struct elevator_ops elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; void (*trim)(struct io_context *); + + elevator_alloc_sched_queue_fn *elevator_alloc_sched_queue_fn; + elevator_free_sched_queue_fn *elevator_free_sched_queue_fn; +#ifdef CONFIG_ELV_FAIR_QUEUING + elevator_active_ioq_set_fn *elevator_active_ioq_set_fn; + elevator_active_ioq_reset_fn *elevator_active_ioq_reset_fn; + + elevator_arm_slice_timer_fn 
*elevator_arm_slice_timer_fn; + elevator_should_preempt_fn *elevator_should_preempt_fn; + elevator_close_cooperator_fn *elevator_close_cooperator_fn; + elevator_expire_ioq_fn *elevator_expire_ioq_fn; +#endif }; #define ELV_NAME_MAX (16) @@ -76,6 +104,9 @@ struct elevator_type struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; +#ifdef CONFIG_ELV_FAIR_QUEUING + int elevator_features; +#endif }; /* @@ -85,10 +116,15 @@ struct elevator_queue { struct elevator_ops *ops; void *elevator_data; + void *sched_queue; struct kobject kobj; struct elevator_type *elevator_type; struct mutex sysfs_lock; struct hlist_head *hash; +#ifdef CONFIG_ELV_FAIR_QUEUING + /* fair queuing data */ + struct elv_fq_data efqd; +#endif }; /* @@ -112,7 +148,8 @@ extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, int); extern void elv_abort_queue(struct request_queue *); extern void elv_completed_request(struct request_queue *, struct request *); -extern int elv_set_request(struct request_queue *, struct request *, gfp_t); +extern int elv_set_request(struct request_queue *, struct request *, + struct bio *bio, gfp_t); extern void elv_put_request(struct request_queue *, struct request *); extern void elv_drain_elevator(struct request_queue *); @@ -207,5 +244,54 @@ enum { __val; \ }) +/* iosched can let elevator know their feature set/capability */ +#ifdef CONFIG_ELV_FAIR_QUEUING + +/* iosched wants to use fair queuing logic of elevator layer */ +#define ELV_IOSCHED_NEED_FQ 1 + +/* iosched maintains only single ioq per group.*/ +#define ELV_IOSCHED_SINGLE_IOQ 2 + +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e) +{ + return (e->elevator_type->elevator_features) & ELV_IOSCHED_NEED_FQ; +} + +static inline int elv_iosched_single_ioq(struct elevator_queue *e) +{ + return (e->elevator_type->elevator_features) & ELV_IOSCHED_SINGLE_IOQ; +} + +#else /* 
ELV_IOSCHED_FAIR_QUEUING */ + +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e) +{ + return 0; +} + +static inline int elv_iosched_single_ioq(struct elevator_queue *e) +{ + return 0; +} + +#endif /* ELV_IOSCHED_FAIR_QUEUING */ +extern void *elv_get_sched_queue(struct request_queue *q, struct request *rq); +extern void *elv_select_sched_queue(struct request_queue *q, int force); +extern void *elv_get_sched_queue_bio(struct request_queue *q, struct bio *bio); + +/* + * This is the equivalent of the rq_is_sync()/cfq_bio_sync() functions, where + * we determine whether an rq/bio is sync or not. There are cases, like during + * merging and during request allocation, where we don't have an rq but a bio + * and need to find out if this bio will be considered sync or async by the + * elevator/iosched. This function is useful in such cases. + */ +static inline int elv_bio_sync(struct bio *bio) +{ + if ((bio_data_dir(bio) == READ) || bio_sync(bio)) + return 1; + return 0; +} #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index dd05434..08b86d2 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -73,6 +73,11 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_GROUP_IOSCHED + /* If task changes the cgroup, elevator processes it asynchronously */ + unsigned short cgroup_changed; +#endif + /* * For request batching */ @@ -104,6 +109,7 @@ int put_io_context(struct io_context *ioc); void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +void init_io_context(struct io_context *ioc); void copy_io_context(struct io_context **pdst, struct io_context **psrc); #else static inline void exit_io_context(void) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e46a073..eb45fe9 100644 --- a/include/linux/memcontrol.h +++
b/include/linux/memcontrol.h @@ -37,6 +37,8 @@ struct mm_struct; * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) */ +extern void __init_mem_page_cgroup(struct page_cgroup *pc); + extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ @@ -121,6 +123,10 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val); #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; +static inline void __init_mem_page_cgroup(struct page_cgroup *pc) +{ +} + static inline int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8895985..c9d1ed4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -605,7 +605,7 @@ typedef struct pglist_data { int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE struct page_cgroup *node_page_cgroup; #endif #endif @@ -956,7 +956,7 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE /* * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use * section. (see memcontrol.h/page_cgroup.h about this.) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 13f126c..f470fd2 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -1,7 +1,7 @@ #ifndef __LINUX_PAGE_CGROUP_H #define __LINUX_PAGE_CGROUP_H -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE #include /* * Page Cgroup can be considered as an extended mem_map. 
@@ -12,9 +12,11 @@ */ struct page_cgroup { unsigned long flags; - struct mem_cgroup *mem_cgroup; struct page *page; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + struct mem_cgroup *mem_cgroup; struct list_head lru; /* per cgroup LRU list */ +#endif }; void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); @@ -83,7 +85,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) bit_spin_unlock(PCG_LOCK, &pc->flags); } -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +#else /* CONFIG_CGROUP_PAGE */ struct page_cgroup; static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) @@ -138,4 +140,27 @@ static inline void swap_cgroup_swapoff(int type) } #endif + +#ifdef CONFIG_CGROUP_BLKIO +/* + * Use the lower 16 bits for flags and reserve the rest for the page tracking id + */ +#define PCG_TRACKING_ID_SHIFT (16) +#define PCG_TRACKING_ID_BITS \ + (8 * sizeof(unsigned long) - PCG_TRACKING_ID_SHIFT) + +/* NOTE: must be called with lock_page_cgroup() held */ +static inline unsigned long page_cgroup_get_id(struct page_cgroup *pc) +{ + return pc->flags >> PCG_TRACKING_ID_SHIFT; +} + +/* NOTE: must be called with lock_page_cgroup() held */ +static inline void page_cgroup_set_id(struct page_cgroup *pc, unsigned long id) +{ + WARN_ON(id >= (1UL << PCG_TRACKING_ID_BITS)); + pc->flags &= (1UL << PCG_TRACKING_ID_SHIFT) - 1; + pc->flags |= (unsigned long)(id << PCG_TRACKING_ID_SHIFT); +} +#endif #endif diff --git a/init/Kconfig b/init/Kconfig index cb2c092..98e49b8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -612,8 +612,31 @@ config CGROUP_MEM_RES_CTLR_SWAP Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +config GROUP_IOSCHED + bool + depends on CGROUPS && ELV_FAIR_QUEUING + default n + ---help--- + This feature lets IO scheduler recognize task groups and control + disk bandwidth allocation to such task groups.
+ endif # CGROUPS +config CGROUP_BLKIO + bool "Block I/O cgroup subsystem" + depends on CGROUPS && BLOCK + select MM_OWNER + help + Provides a Resource Controller which makes it possible to track the + owner of every block I/O request. + The information this subsystem provides can be used from any + kind of module such as dm-ioband device mapper modules or + the cfq-scheduler. + +config CGROUP_PAGE + def_bool y + depends on CGROUP_MEM_RES_CTLR || CGROUP_BLKIO + config MM_OWNER bool diff --git a/mm/Makefile b/mm/Makefile index 5e0bd64..6208744 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,6 +39,8 @@ else obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o +obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o +obj-$(CONFIG_CGROUP_BLKIO) += biotrack.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c86edd2..2f77b90 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -7,6 +7,7 @@ #include #include #include +#include "../block/elevator-fq.h" void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -327,3 +328,64 @@ long congestion_wait(int sync, long timeout) } EXPORT_SYMBOL(congestion_wait); +/* + * With group IO scheduling, there are request descriptors per io group per + * queue. So generic notion of whether queue is congested or not is not + * very accurate. Queue might not be congested but the io group in which + * request will go might actually be congested. + * + * Hence to get the correct idea about congestion level, one should query + * the io group congestion status on the queue. Pass in the page information + * which can be used to determine the io group of the page and congestion + * status can be determined accordingly.
+ * + * If page info is not passed, io group is determined from the current task + * context. + */ +#ifdef CONFIG_GROUP_IOSCHED +int bdi_congested_group(struct backing_dev_info *bdi, int bdi_bits, + struct page *page) +{ + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, bdi_bits, page, 1); + + return blk_queue_io_group_congested(bdi, bdi_bits, page); +} +EXPORT_SYMBOL(bdi_congested_group); + +int bdi_read_congested_group(struct backing_dev_info *bdi, struct page *page) +{ + return bdi_congested_group(bdi, 1 << BDI_sync_congested, page); +} +EXPORT_SYMBOL(bdi_read_congested_group); + +/* Checks if either bdi or associated group is read congested */ +int bdi_or_group_read_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_read_congested(bdi) || bdi_read_congested_group(bdi, page); +} +EXPORT_SYMBOL(bdi_or_group_read_congested); + +int bdi_write_congested_group(struct backing_dev_info *bdi, struct page *page) +{ + return bdi_congested_group(bdi, 1 << BDI_async_congested, page); +} +EXPORT_SYMBOL(bdi_write_congested_group); + +/* Checks if either bdi or associated group is write congested */ +int bdi_or_group_write_congested(struct backing_dev_info *bdi, + struct page *page) +{ + return bdi_write_congested(bdi) || bdi_write_congested_group(bdi, page); +} +EXPORT_SYMBOL(bdi_or_group_write_congested); + +int bdi_rw_congested_group(struct backing_dev_info *bdi, struct page *page) +{ + return bdi_congested_group(bdi, (1 << BDI_sync_congested) | + (1 << BDI_async_congested), page); +} +EXPORT_SYMBOL(bdi_rw_congested_group); + +#endif /* CONFIG_GROUP_IOSCHED */ diff --git a/mm/biotrack.c b/mm/biotrack.c new file mode 100644 index 0000000..320f511 --- /dev/null +++ b/mm/biotrack.c @@ -0,0 +1,321 @@ +/* biotrack.c - Block I/O Tracking + * + * Copyright (C) VA Linux Systems Japan, 2008-2009 + * Developed by Hirokazu Takahashi + * + * Copyright (C) 2008 Andrea Righi + * Use part of page_cgroup->flags to store blkio-cgroup ID. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +/* + * The block I/O tracking mechanism is implemented on the cgroup memory + * controller framework. It helps to find the owner of an I/O request + * because every I/O request has a target page and the owner of the page + * can be easily determined on the framework. + */ + +/* Return the blkio_cgroup associated with a cgroup. */ +static inline struct blkio_cgroup *cgroup_blkio(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, blkio_cgroup_subsys_id), + struct blkio_cgroup, css); +} + +/* Return the blkio_cgroup associated with a process. */ +static inline struct blkio_cgroup *blkio_cgroup_from_task(struct task_struct *p) +{ + return container_of(task_subsys_state(p, blkio_cgroup_subsys_id), + struct blkio_cgroup, css); +} + +static struct io_context default_blkio_io_context; +static struct blkio_cgroup default_blkio_cgroup = { + .io_context = &default_blkio_io_context, +}; + +/** + * blkio_cgroup_set_owner() - set the owner ID of a page. + * @page: the page we want to tag + * @mm: the mm_struct of a page owner + * + * Make a given page have the blkio-cgroup ID of the owner of this page.
+ */ +void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm) +{ + struct blkio_cgroup *biog; + struct page_cgroup *pc; + unsigned long id; + + if (blkio_cgroup_disabled()) + return; + pc = lookup_page_cgroup(page); + if (unlikely(!pc)) + return; + + lock_page_cgroup(pc); + page_cgroup_set_id(pc, 0); /* 0: default blkio_cgroup id */ + unlock_page_cgroup(pc); + if (!mm) + return; + + rcu_read_lock(); + biog = blkio_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!biog)) { + rcu_read_unlock(); + return; + } + /* + * css_get(&bio->css) isn't called to increment the reference + * count of this blkio_cgroup "biog" so the css_id might turn + * invalid even if this page is still active. + * This approach is chosen to minimize the overhead. + */ + id = css_id(&biog->css); + rcu_read_unlock(); + lock_page_cgroup(pc); + page_cgroup_set_id(pc, id); + unlock_page_cgroup(pc); +} + +/** + * blkio_cgroup_reset_owner() - reset the owner ID of a page + * @page: the page we want to tag + * @mm: the mm_struct of a page owner + * + * Change the owner of a given page if necessary. + */ +void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm) +{ + blkio_cgroup_set_owner(page, mm); +} + +/** + * blkio_cgroup_reset_owner_pagedirty() - reset the owner ID of a pagecache page + * @page: the page we want to tag + * @mm: the mm_struct of a page owner + * + * Change the owner of a given page if the page is in the pagecache. + */ +void blkio_cgroup_reset_owner_pagedirty(struct page *page, struct mm_struct *mm) +{ + if (!page_is_file_cache(page)) + return; + if (current->flags & PF_MEMALLOC) + return; + + blkio_cgroup_reset_owner(page, mm); +} + +/** + * blkio_cgroup_copy_owner() - copy the owner ID of a page into another page + * @npage: the page where we want to copy the owner + * @opage: the page from which we want to copy the ID + * + * Copy the owner ID of @opage into @npage. 
+ */
+void blkio_cgroup_copy_owner(struct page *npage, struct page *opage)
+{
+	struct page_cgroup *npc, *opc;
+	unsigned long id;
+
+	if (blkio_cgroup_disabled())
+		return;
+	npc = lookup_page_cgroup(npage);
+	if (unlikely(!npc))
+		return;
+	opc = lookup_page_cgroup(opage);
+	if (unlikely(!opc))
+		return;
+
+	/*
+	 * Take the two page_cgroup locks one after the other instead of
+	 * nesting them.  Assuming lock_page_cgroup() is a non-recursive
+	 * spinlock, nesting would self-deadlock when @npage == @opage and
+	 * would give two concurrent copies in opposite directions an ABBA
+	 * lock order.
+	 */
+	lock_page_cgroup(opc);
+	id = page_cgroup_get_id(opc);
+	unlock_page_cgroup(opc);
+
+	lock_page_cgroup(npc);
+	page_cgroup_set_id(npc, id);
+	unlock_page_cgroup(npc);
+}
+
+/*
+ * Create a new blkio-cgroup.
+ *
+ * The root cgroup (no parent) gets the static default_blkio_cgroup, whose
+ * io_context is initialised here.  Any other cgroup gets a zeroed
+ * blkio_cgroup and a freshly allocated io_context.  Returns the embedded
+ * css, or ERR_PTR(-ENOMEM) when either allocation fails.
+ */
+static struct cgroup_subsys_state *
+blkio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct blkio_cgroup *biog;
+	struct io_context *ioc;
+
+	if (!cgrp->parent) {
+		biog = &default_blkio_cgroup;
+		init_io_context(biog->io_context);
+		/* Increment the reference count not to be released ever. */
+		atomic_long_inc(&biog->io_context->refcount);
+		return &biog->css;
+	}
+
+	biog = kzalloc(sizeof(*biog), GFP_KERNEL);
+	if (!biog)
+		return ERR_PTR(-ENOMEM);
+	ioc = alloc_io_context(GFP_KERNEL, -1);
+	if (!ioc) {
+		/* Undo the first allocation so nothing leaks on failure. */
+		kfree(biog);
+		return ERR_PTR(-ENOMEM);
+	}
+	biog->io_context = ioc;
+	return &biog->css;
+}
+
+/*
+ * Delete the blkio-cgroup: drop its io_context reference, release its
+ * css ID and free the blkio_cgroup itself.
+ */
+static void blkio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct blkio_cgroup *biog = cgroup_blkio(cgrp);
+
+	put_io_context(biog->io_context);
+	free_css_id(&blkio_cgroup_subsys, &biog->css);
+	kfree(biog);
+}
+
+/**
+ * get_blkio_cgroup_id() - determine the blkio-cgroup ID
+ * @bio:	the &struct bio which describes the I/O
+ *
+ * Returns the blkio-cgroup ID of a given bio. A return value zero
+ * means that the page associated with the bio belongs to default_blkio_cgroup.
+ *
+ * Only the first page of the bio is consulted.  A bio that carries no
+ * pages is reported as ID zero as well.
+ */
+unsigned long get_blkio_cgroup_id(struct bio *bio)
+{
+	struct page_cgroup *pc;
+	struct page *page;
+	unsigned long id = 0;
+
+	/*
+	 * A bio without a bio_vec array, or with no entries filled in,
+	 * has no first page to inspect; treat it as the default group
+	 * instead of dereferencing bi_io_vec[0].
+	 */
+	if (unlikely(!bio->bi_io_vec || !bio->bi_vcnt))
+		return 0;
+
+	page = bio_iovec_idx(bio, 0)->bv_page;
+	pc = lookup_page_cgroup(page);
+	if (pc) {
+		lock_page_cgroup(pc);
+		id = page_cgroup_get_id(pc);
+		unlock_page_cgroup(pc);
+	}
+	return id;
+}
+
+/**
+ * get_blkio_cgroup_id_page() - determine the blkio-cgroup ID
+ * @page:	the &struct page which describes the I/O
+ *
+ * Returns the blkio-cgroup ID of a given page. A return value zero
+ * means that the page associated with the IO belongs to default_blkio_cgroup.
+ * Zero is also returned when no page_cgroup is found for @page.
+ */
+unsigned long get_blkio_cgroup_id_page(struct page *page)
+{
+	struct page_cgroup *pc;
+	unsigned long id = 0;
+
+	pc = lookup_page_cgroup(page);
+	if (pc) {
+		lock_page_cgroup(pc);
+		id = page_cgroup_get_id(pc);
+		unlock_page_cgroup(pc);
+	}
+	return id;
+}
+
+/**
+ * get_blkio_cgroup_iocontext() - determine the blkio-cgroup iocontext
+ * @bio:	the &struct bio which describe the I/O
+ *
+ * Returns the iocontext of blkio-cgroup that issued a given bio.  When
+ * css_lookup() finds nothing for the bio's ID, the io_context of
+ * default_blkio_cgroup is used.  The refcount of the returned io_context
+ * is incremented before it is returned.
+ */
+struct io_context *get_blkio_cgroup_iocontext(struct bio *bio)
+{
+	struct cgroup_subsys_state *css;
+	struct blkio_cgroup *biog;
+	struct io_context *ioc;
+	unsigned long id;
+
+	id = get_blkio_cgroup_id(bio);
+	/* The css lookup and the refcount bump both happen under RCU. */
+	rcu_read_lock();
+	css = css_lookup(&blkio_cgroup_subsys, id);
+	if (css)
+		biog = container_of(css, struct blkio_cgroup, css);
+	else
+		biog = &default_blkio_cgroup;
+	ioc = biog->io_context;	/* default io_context for this cgroup */
+	atomic_long_inc(&ioc->refcount);
+	rcu_read_unlock();
+	return ioc;
+}
+
+/**
+ * blkio_cgroup_lookup() - lookup a cgroup by blkio-cgroup ID
+ * @id:	blkio-cgroup ID
+ *
+ * Returns the cgroup associated with the specified ID, or NULL if lookup
+ * fails.
+ *
+ * Note:
+ * This function should be called under rcu_read_lock().
+ */ +struct cgroup *blkio_cgroup_lookup(int id) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + + if (blkio_cgroup_disabled()) + return NULL; + + css = css_lookup(&blkio_cgroup_subsys, id); + if (!css) + return NULL; + cgrp = css->cgroup; + return cgrp; +} +EXPORT_SYMBOL(get_blkio_cgroup_iocontext); +EXPORT_SYMBOL(get_blkio_cgroup_id); +EXPORT_SYMBOL(blkio_cgroup_lookup); +/* read_u64 handler of the "id" file: returns css_id() of the cgroup's blkio css, read under rcu_read_lock(). */ +static u64 blkio_id_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct blkio_cgroup *biog = cgroup_blkio(cgrp); + unsigned long id; + + rcu_read_lock(); + id = css_id(&biog->css); + rcu_read_unlock(); + return (u64)id; +} + +/* Control files of the blkio subsystem: a single "id" file backed by blkio_id_read(). */ +static struct cftype blkio_files[] = { + { + .name = "id", + .read_u64 = blkio_id_read, + }, +}; +/* Add the blkio_files entries to @cgrp through cgroup_add_files() and return its result. */ +static int blkio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, blkio_files, + ARRAY_SIZE(blkio_files)); +} +/* The "blkio" cgroup subsystem: hooks up the create/destroy/populate callbacks of this file; use_id is set to 1. */ +struct cgroup_subsys blkio_cgroup_subsys = { + .name = "blkio", + .create = blkio_cgroup_create, + .destroy = blkio_cgroup_destroy, + .populate = blkio_cgroup_populate, + .subsys_id = blkio_cgroup_subsys_id, + .use_id = 1, +}; diff --git a/mm/bounce.c b/mm/bounce.c index a2b76a5..422d89c 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -210,6 +211,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + blkio_cgroup_copy_owner(to->bv_page, page); if (rw == WRITE) { char *vto, *vfrom; diff --git a/mm/filemap.c b/mm/filemap.c index ccea3b6..01c47a1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include #include /* for BUG_ON(!in_atomic()) only */ #include +#include #include /* for page_is_file_cache() */ #include "internal.h" @@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, gfp_mask & GFP_RECLAIM_MASK); if (error) goto out; +
blkio_cgroup_set_owner(page, current->mm); error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e717964..98c7d19 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -129,6 +129,12 @@ struct mem_cgroup_lru_info { struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; }; +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc) +{ + pc->mem_cgroup = NULL; + INIT_LIST_HEAD(&pc->lru); +} + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide diff --git a/mm/memory.c b/mm/memory.c index 6521619..579991d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -2115,6 +2116,7 @@ gotten: */ ptep_clear_flush_notify(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + blkio_cgroup_set_owner(new_page, mm); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { @@ -2580,6 +2582,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + blkio_cgroup_reset_owner(page, mm); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -2644,6 +2647,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto release; inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); + blkio_cgroup_set_owner(page, mm); set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ @@ -2791,6 +2795,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (anon) { inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); + blkio_cgroup_set_owner(page, mm); } else { inc_mm_counter(mm, 
file_rss); page_add_file_rmap(page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 81627eb..f924e05 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -984,6 +985,17 @@ retry: if (nr_pages == 0) break; + /* + * If the io group page will go into is congested, bail out. + */ + if (wbc->nonblocking + && bdi_write_congested_group(bdi, pvec.pages[0])) { + wbc->encountered_congestion = 1; + done = 1; + pagevec_release(&pvec); + break; + } + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1247,6 +1259,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); + blkio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f22b4eb..2883bb7 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -9,14 +9,15 @@ #include #include #include +#include static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) { pc->flags = 0; - pc->mem_cgroup = NULL; pc->page = pfn_to_page(pfn); - INIT_LIST_HEAD(&pc->lru); + __init_mem_page_cgroup(pc); + __init_blkio_page_cgroup(pc); } static unsigned long total_usage; @@ -74,7 +75,7 @@ void __init page_cgroup_init_flatmem(void) int nid, fail; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() && blkio_cgroup_disabled()) return; for_each_online_node(nid) { @@ -83,12 +84,12 @@ void __init page_cgroup_init_flatmem(void) goto fail; } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" + printk(KERN_INFO "please try cgroup_disable=memory,blkio option if you" " don't want memory cgroups\n"); return; fail: printk(KERN_CRIT "allocation of page_cgroup 
failed.\n"); - printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); + printk(KERN_CRIT "please try cgroup_disable=memory,blkio boot options\n"); panic("Out of memory"); } @@ -245,7 +246,7 @@ void __init page_cgroup_init(void) unsigned long pfn; int fail = 0; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() && blkio_cgroup_disabled()) return; for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { @@ -260,8 +261,8 @@ void __init page_cgroup_init(void) hotplug_memory_notifier(page_cgroup_callback, 0); } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" - " want memory cgroups\n"); + printk(KERN_INFO "please try cgroup_disable=memory,blkio option" + " if you don't want memory and io cgroups\n"); } void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa23..22e0639 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -542,7 +542,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_or_group_read_congested(mapping->backing_dev_info, NULL)) return; /* do read-ahead */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38e..6eb96f1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -307,6 +308,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ __set_page_locked(new_page); SetPageSwapBacked(new_page); + blkio_cgroup_set_owner(new_page, current->mm); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { /*