block copy: use two bios

This patch changes the architecture of xcopy so that two bios are used. There used to be just one bio that held pointers to both the source and destination block devices. However, a bio with two block devices cannot really be passed through block midlayer drivers (dm and md).

When we need to send the XCOPY command, we call the function blkdev_issue_copy. This function creates two bios, the first with the operation REQ_OP_COPY_READ and the second with REQ_OP_COPY_WRITE. The bios have a pointer to a common bi_copy structure. These bios travel independently through the block device stack. When both bios reach the physical disk driver (the function blk_queue_bio), they are paired, a request is made, and the request is sent to the SCSI disk driver.

It is possible that one of the bios reaches a device that doesn't support XCOPY; in that case both bios are aborted with an error.

Note that there is no guarantee that the XCOPY command will succeed. If it doesn't succeed, the caller is supposed to perform the copy manually.
Signed-off-by: Mikulas Patocka --- block/bio.c | 29 +++++++++++++- block/blk-core.c | 45 ++++++++++++++++++++++ block/blk-lib.c | 92 ++++++++++++++++++++++++++++++++-------------- block/blk-merge.c | 6 ++- drivers/scsi/sd.c | 12 ++---- include/linux/bio.h | 12 ++++-- include/linux/blk_types.h | 19 +++++++-- 7 files changed, 168 insertions(+), 47 deletions(-) Index: linux-4.11-rc2/block/blk-lib.c =================================================================== --- linux-4.11-rc2.orig/block/blk-lib.c +++ linux-4.11-rc2/block/blk-lib.c @@ -15,17 +15,6 @@ struct bio_batch { struct completion *wait; }; -static void bio_batch_end_io(struct bio *bio) -{ - struct bio_batch *bb = bio->bi_private; - - if (bio->bi_error && bio->bi_error != -EOPNOTSUPP) - bb->error = bio->bi_error; - if (atomic_dec_and_test(&bb->done)) - complete(bb->wait); - bio_put(bio); -} - static struct bio *next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) { @@ -398,6 +387,38 @@ int blkdev_issue_zeroout(struct block_de } EXPORT_SYMBOL(blkdev_issue_zeroout); +static void bio_copy_end_io(struct bio *bio) +{ + struct bio_copy *bc = bio->bi_copy; + if (unlikely(bio->bi_error)) { + unsigned long flags; + int dir; + struct bio *other; + + /* if the other bio is waiting for the pair, release it */ + spin_lock_irqsave(&bc->spinlock, flags); + if (bc->error >= 0) + bc->error = bio->bi_error; + dir = bio_data_dir(bio); + other = bc->pair[dir ^ 1]; + bc->pair[dir ^ 1] = NULL; + spin_unlock_irqrestore(&bc->spinlock, flags); + if (other) { + other->bi_error = bio->bi_error; + bio_endio(other); + } + } + bio_put(bio); + if (atomic_dec_and_test(&bc->in_flight)) { + struct bio_batch *bb = bc->private; + if (unlikely(bc->error < 0) && !ACCESS_ONCE(bb->error)) + ACCESS_ONCE(bb->error) = bc->error; + kfree(bc); + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); + } +} + /** * blkdev_issue_copy - queue a copy same operation * @src_bdev: source blockdev @@ -444,9 +465,9 @@ int blkdev_issue_copy(struct 
block_devic bb.wait = &wait; while (nr_sects) { - struct bio *bio; + struct bio *read_bio, *write_bio; struct bio_copy *bc; - unsigned int chunk; + unsigned int chunk = min(nr_sects, max_copy_sectors); bc = kmalloc(sizeof(struct bio_copy), gfp_mask); if (!bc) { @@ -454,28 +475,45 @@ int blkdev_issue_copy(struct block_devic break; } - bio = bio_alloc(gfp_mask, 0); - if (!bio) { + read_bio = bio_alloc(gfp_mask, 1); + if (!read_bio) { kfree(bc); ret = -ENOMEM; break; } - chunk = min(nr_sects, max_copy_sectors); - - bio_set_op_attrs(bio, REQ_OP_COPY, 0); - bio->bi_iter.bi_sector = dst_sector; - bio->bi_iter.bi_size = chunk << 9; - bio->bi_end_io = bio_batch_end_io; - bio->bi_bdev = dst_bdev; - bio->bi_private = &bb; - bio->bi_copy = bc; + write_bio = bio_alloc(gfp_mask, 1); + if (!write_bio) { + bio_put(read_bio); + kfree(bc); + ret = -ENOMEM; + break; + } - bc->bic_bdev = src_bdev; - bc->bic_sector = src_sector; + atomic_set(&bc->in_flight, 2); + bc->error = 1; + bc->pair[0] = NULL; + bc->pair[1] = NULL; + bc->private = &bb; + spin_lock_init(&bc->spinlock); + + bio_set_op_attrs(read_bio, REQ_OP_COPY_READ, 0); + read_bio->bi_iter.bi_sector = src_sector; + read_bio->bi_iter.bi_size = chunk << 9; + read_bio->bi_end_io = bio_copy_end_io; + read_bio->bi_bdev = src_bdev; + read_bio->bi_copy = bc; + + bio_set_op_attrs(write_bio, REQ_OP_COPY_WRITE, 0); + write_bio->bi_iter.bi_sector = dst_sector; + write_bio->bi_iter.bi_size = chunk << 9; + write_bio->bi_end_io = bio_copy_end_io; + write_bio->bi_bdev = dst_bdev; + write_bio->bi_copy = bc; atomic_inc(&bb.done); - submit_bio(bio); + submit_bio(read_bio); + submit_bio(write_bio); src_sector += chunk; dst_sector += chunk; Index: linux-4.11-rc2/include/linux/blk_types.h =================================================================== --- linux-4.11-rc2.orig/include/linux/blk_types.h +++ linux-4.11-rc2/include/linux/blk_types.h @@ -7,6 +7,7 @@ #include #include +#include struct bio_set; struct bio; @@ -91,8 +92,16 @@ struct bio 
{ #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) struct bio_copy { - struct block_device *bic_bdev; - sector_t bic_sector; + /* + * error == 1 - bios are waiting to be paired + * error == 0 - pair was issued + * error < 0 - error + */ + int error; + atomic_t in_flight; + struct bio *pair[2]; + void *private; + spinlock_t spinlock; }; /* @@ -167,8 +176,10 @@ enum req_opf { REQ_OP_WRITE_SAME = 7, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 8, - /* copy offload */ - REQ_OP_COPY = 9, + /* copy offload, write bio */ + REQ_OP_COPY_WRITE = 9, + /* copy offload, read bio */ + REQ_OP_COPY_READ = 10, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, Index: linux-4.11-rc2/block/bio.c =================================================================== --- linux-4.11-rc2.orig/block/bio.c +++ linux-4.11-rc2/block/bio.c @@ -243,8 +243,6 @@ static void __bio_free(struct bio *bio) { bio_disassociate_task(bio); - kfree(bio->bi_copy); - if (bio_integrity(bio)) bio_integrity_free(bio); } @@ -588,6 +586,7 @@ void __bio_clone_fast(struct bio *bio, s bio_set_flag(bio, BIO_CLONED); bio->bi_opf = bio_src->bi_opf; bio->bi_iter = bio_src->bi_iter; + bio->bi_copy = bio_src->bi_copy; bio->bi_io_vec = bio_src->bi_io_vec; bio_clone_blkcg_association(bio, bio_src); @@ -1812,6 +1811,28 @@ static inline bool bio_remaining_done(st return false; } +static noinline_for_stack void bio_endio_copy(struct bio *bio) +{ + struct bio_copy *bc = bio->bi_copy; + struct bio *other = NULL; + unsigned long flags; + int dir; + + spin_lock_irqsave(&bc->spinlock, flags); + dir = bio_data_dir(bio); + if (bc->pair[dir]) { + BUG_ON(bc->pair[dir] != bio); + other = bc->pair[dir ^ 1]; + bc->pair[0] = bc->pair[1] = NULL; + } + spin_unlock_irqrestore(&bc->spinlock, flags); + + if (other) { + other->bi_error = bio->bi_error; + bio_endio(other); + } +} + /** * bio_endio - end I/O on a bio * @bio: bio @@ -1827,6 +1848,10 @@ again: if (!bio_remaining_done(bio)) return; + 
if (unlikely(bio_op(bio) == REQ_OP_COPY_READ) || + unlikely(bio_op(bio) == REQ_OP_COPY_WRITE)) + bio_endio_copy(bio); + /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that Index: linux-4.11-rc2/block/blk-core.c =================================================================== --- linux-4.11-rc2.orig/block/blk-core.c +++ linux-4.11-rc2/block/blk-core.c @@ -1620,6 +1620,33 @@ void init_request_from_bio(struct reques blk_rq_bio_prep(req->q, req, bio); } +static noinline_for_stack struct bio *blk_queue_copy(struct bio *bio) +{ + struct bio_copy *bc = bio->bi_copy; + int dir, error; + struct bio *ret; + + spin_lock_irq(&bc->spinlock); + error = bc->error; + if (unlikely(error < 0)) { + spin_unlock_irq(&bc->spinlock); + bio->bi_error = error; + bio_endio(bio); + return NULL; + } + dir = bio_data_dir(bio); + bc->pair[dir] = bio; + if (bc->pair[dir ^ 1]) { + ret = bc->pair[1]; + bc->error = 0; + } else { + ret = NULL; + } + spin_unlock_irq(&bc->spinlock); + + return ret; +} + static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { struct blk_plug *plug; @@ -1687,6 +1714,21 @@ static blk_qc_t blk_queue_bio(struct req } get_rq: + if (unlikely(bio_op(bio) == REQ_OP_COPY_READ) || + unlikely(bio_op(bio) == REQ_OP_COPY_WRITE)) { + if (unlikely(bio_sectors(bio) > q->limits.max_copy_sectors)) { + spin_unlock_irq(q->queue_lock); + bio->bi_error = -EOPNOTSUPP; + bio_endio(bio); + return BLK_QC_T_NONE; + } + spin_unlock_irq(q->queue_lock); + bio = blk_queue_copy(bio); + if (!bio) + return BLK_QC_T_NONE; + spin_lock_irq(q->queue_lock); + } + wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); /* @@ -1921,7 +1963,8 @@ generic_make_request_checks(struct bio * if (!bdev_write_zeroes_sectors(bio->bi_bdev)) goto not_supported; break; - case REQ_OP_COPY: + case REQ_OP_COPY_READ: + case REQ_OP_COPY_WRITE: if (!bdev_copy_offload(bio->bi_bdev)) goto not_supported; break; Index: 
linux-4.11-rc2/drivers/scsi/sd.c =================================================================== --- linux-4.11-rc2.orig/drivers/scsi/sd.c +++ linux-4.11-rc2/drivers/scsi/sd.c @@ -959,12 +959,9 @@ static int sd_setup_copy_cmnd(struct scs struct page *page; unsigned char *buf; - if (!bio->bi_copy) - return BLKPREP_KILL; - dst_sdp = scsi_disk(rq->rq_disk)->device; dst_queue = rq->rq_disk->queue; - src_disk = bio->bi_copy->bic_bdev->bd_disk; + src_disk = bio->bi_copy->pair[0]->bi_bdev->bd_disk; src_queue = src_disk->queue; if (!src_queue || src_queue->make_request_fn != dst_queue->make_request_fn || @@ -981,7 +978,7 @@ static int sd_setup_copy_cmnd(struct scs return BLKPREP_KILL; dst_lba = blk_rq_pos(rq) >> (ilog2(dst_sdp->sector_size) - 9); - src_lba = bio->bi_copy->bic_sector >> (ilog2(src_sdp->sector_size) - 9); + src_lba = bio->bi_copy->pair[0]->bi_iter.bi_sector >> (ilog2(src_sdp->sector_size) - 9); nr_blocks = blk_rq_sectors(rq) >> (ilog2(dst_sdp->sector_size) - 9); page = alloc_page(GFP_ATOMIC | __GFP_ZERO); @@ -1319,7 +1316,7 @@ static int sd_init_command(struct scsi_c case REQ_OP_READ: case REQ_OP_WRITE: return sd_setup_read_write_cmnd(cmd); - case REQ_OP_COPY: + case REQ_OP_COPY_WRITE: return sd_setup_copy_cmnd(cmd); case REQ_OP_ZONE_REPORT: return sd_zbc_setup_report_cmnd(cmd); @@ -1952,7 +1949,8 @@ static int sd_done(struct scsi_cmnd *SCp switch (req_op(req)) { case REQ_OP_DISCARD: case REQ_OP_WRITE_SAME: - case REQ_OP_COPY: + case REQ_OP_COPY_READ: + case REQ_OP_COPY_WRITE: case REQ_OP_ZONE_RESET: if (!result) { good_bytes = blk_rq_bytes(req); Index: linux-4.11-rc2/include/linux/bio.h =================================================================== --- linux-4.11-rc2.orig/include/linux/bio.h +++ linux-4.11-rc2/include/linux/bio.h @@ -78,7 +78,8 @@ static inline bool bio_has_data(struct b bio_op(bio) != REQ_OP_DISCARD && bio_op(bio) != REQ_OP_SECURE_ERASE && bio_op(bio) != REQ_OP_WRITE_ZEROES && - bio_op(bio) != REQ_OP_COPY) + bio_op(bio) != 
REQ_OP_COPY_READ && + bio_op(bio) != REQ_OP_COPY_WRITE) return true; return false; @@ -90,12 +91,14 @@ static inline bool bio_no_advance_iter(s bio_op(bio) == REQ_OP_SECURE_ERASE || bio_op(bio) == REQ_OP_WRITE_SAME || bio_op(bio) == REQ_OP_WRITE_ZEROES || - bio_op(bio) == REQ_OP_COPY; + bio_op(bio) == REQ_OP_COPY_READ || + bio_op(bio) == REQ_OP_COPY_WRITE; } static inline bool bio_mergeable(struct bio *bio) { - if (bio_op(bio) == REQ_OP_COPY) + if (bio_op(bio) == REQ_OP_COPY_READ || + bio_op(bio) == REQ_OP_COPY_WRITE) return false; if (bio->bi_opf & REQ_NOMERGE_FLAGS) return false; @@ -204,7 +207,8 @@ static inline unsigned __bio_segments(st case REQ_OP_WRITE_ZEROES: return 0; case REQ_OP_WRITE_SAME: - case REQ_OP_COPY: + case REQ_OP_COPY_READ: + case REQ_OP_COPY_WRITE: return 1; default: break; Index: linux-4.11-rc2/block/blk-merge.c =================================================================== --- linux-4.11-rc2.orig/block/blk-merge.c +++ linux-4.11-rc2/block/blk-merge.c @@ -206,7 +206,8 @@ void blk_queue_split(struct request_queu case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); break; - case REQ_OP_COPY: + case REQ_OP_COPY_READ: + case REQ_OP_COPY_WRITE: return; default: split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); @@ -247,7 +248,8 @@ static unsigned int __blk_recalc_rq_segm case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: - case REQ_OP_COPY: + case REQ_OP_COPY_READ: + case REQ_OP_COPY_WRITE: return 0; case REQ_OP_WRITE_SAME: return 1;