DM-Devel Archive mirror
* [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far
@ 2019-02-20 21:44 Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 1/9] dm: update dm_process_bio() to split bio if in ->make_request_fn() Mike Snitzer
                   ` (8 more replies)
  0 siblings, 9 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

This v2 eliminates the dm-mpath local cache for per-bio-data and
switches to allocating the per-bio-data required by the target as part
of the 'struct dm_noclone' allocation -- special thanks to Mikulas for
his ideas and patch (#6) that went into this v2.

Test results show a slight improvement for bio-based dm-multipath (but
not the kind of win dm-linear gets from switching to noclone):

   linear: raw=6087MB/s clone=4241MB/s noclone=5148MB/s
   mpath:  raw=6087MB/s clone=3331MB/s noclone=3420MB/s

More work is needed to improve DM multipath performance (story of my
life)... but at least this shifts the focus away from DM core and back
to the dm-multipath target code.

Mike Snitzer (6):
  dm: update dm_process_bio() to split bio if in ->make_request_fn()
  dm: eliminate 'split_discard_bios' flag from DM target interface
  dm: improve noclone bio support
  dm: improve noclone_endio() to support multipath target
  dm mpath: enable noclone support for bio-based
  dm: remove unused _rq_tio_cache and _rq_cache

Mikulas Patocka (3):
  dm: refactor start_io_acct and end_io_acct
  dm: implement noclone optimization for bio-based
  dm: add per-bio-data support to noclone bio

 drivers/md/dm-cache-target.c  |   1 -
 drivers/md/dm-linear.c        |   3 +-
 drivers/md/dm-mpath.c         |  13 +-
 drivers/md/dm-raid.c          |  14 +-
 drivers/md/dm-rq.c            |  16 ++
 drivers/md/dm-rq.h            |  16 --
 drivers/md/dm-stripe.c        |   3 +-
 drivers/md/dm-table.c         |  11 ++
 drivers/md/dm-thin.c          |   1 -
 drivers/md/dm-zero.c          |   1 +
 drivers/md/dm-zoned-target.c  |   1 -
 drivers/md/dm.c               | 406 ++++++++++++++++++++++++++++--------------
 drivers/md/dm.h               |   1 +
 include/linux/device-mapper.h |   9 +-
 include/uapi/linux/dm-ioctl.h |   4 +-
 15 files changed, 330 insertions(+), 170 deletions(-)

-- 
2.15.0


* [PATCH v2 1/9] dm: update dm_process_bio() to split bio if in ->make_request_fn()
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 2/9] dm: eliminate 'split_discard_bios' flag from DM target interface Mike Snitzer
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

Must call blk_queue_split(); otherwise queue_limits for abnormal
requests (e.g. discard, writesame, etc.) won't be imposed.

In addition, add dm_queue_split() to simplify the DM-specific splitting
that is needed for targets that impose ti->max_io_len.
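
For reference, the resulting split decision in dm_process_bio()
(condensed from the patch below; the target lookup and !map handling
are elided):

  if (current->bio_list) {
          /* in ->make_request_fn(), so any splitting must happen now */
          if (is_abnormal_io(bio))
                  blk_queue_split(md->queue, &bio);  /* honors queue_limits */
          else
                  dm_queue_split(md, ti, &bio);      /* honors ti->max_io_len */
  }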

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 93 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 27 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 515e6af9bed2..7a774fcd0194 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1533,6 +1533,22 @@ static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
 	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false);
 }
 
+static bool is_abnormal_io(struct bio *bio)
+{
+	bool r = false;
+
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE_ZEROES:
+		r = true;
+		break;
+	}
+
+	return r;
+}
+
 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 				  int *result)
 {
@@ -1565,7 +1581,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 	if (!dm_target_is_valid(ti))
 		return -EIO;
 
-	if (unlikely(__process_abnormal_io(ci, ti, &r)))
+	if (__process_abnormal_io(ci, ti, &r))
 		return r;
 
 	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
@@ -1601,13 +1617,6 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 	blk_qc_t ret = BLK_QC_T_NONE;
 	int error = 0;
 
-	if (unlikely(!map)) {
-		bio_io_error(bio);
-		return ret;
-	}
-
-	blk_queue_split(md->queue, &bio);
-
 	init_clone_info(&ci, md, map, bio);
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
@@ -1675,18 +1684,13 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
  * Optimized variant of __split_and_process_bio that leverages the
  * fact that targets that use it do _not_ have a need to split bios.
  */
-static blk_qc_t __process_bio(struct mapped_device *md,
-			      struct dm_table *map, struct bio *bio)
+static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
+			      struct bio *bio, struct dm_target *ti)
 {
 	struct clone_info ci;
 	blk_qc_t ret = BLK_QC_T_NONE;
 	int error = 0;
 
-	if (unlikely(!map)) {
-		bio_io_error(bio);
-		return ret;
-	}
-
 	init_clone_info(&ci, md, map, bio);
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
@@ -1704,21 +1708,11 @@ static blk_qc_t __process_bio(struct mapped_device *md,
 		error = __send_empty_flush(&ci);
 		/* dec_pending submits any data associated with flush */
 	} else {
-		struct dm_target *ti = md->immutable_target;
 		struct dm_target_io *tio;
 
-		/*
-		 * Defend against IO still getting in during teardown
-		 * - as was seen for a time with nvme-fcloop
-		 */
-		if (WARN_ON_ONCE(!ti || !dm_target_is_valid(ti))) {
-			error = -EIO;
-			goto out;
-		}
-
 		ci.bio = bio;
 		ci.sector_count = bio_sectors(bio);
-		if (unlikely(__process_abnormal_io(&ci, ti, &error)))
+		if (__process_abnormal_io(&ci, ti, &error))
 			goto out;
 
 		tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
@@ -1730,11 +1724,56 @@ static blk_qc_t __process_bio(struct mapped_device *md,
 	return ret;
 }
 
+static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
+{
+	unsigned len, sector_count;
+
+	sector_count = bio_sectors(*bio);
+	len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
+
+	if (sector_count > len) {
+		struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
+
+		bio_chain(split, *bio);
+		trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
+		generic_make_request(*bio);
+		*bio = split;
+	}
+}
+
 static blk_qc_t dm_process_bio(struct mapped_device *md,
 			       struct dm_table *map, struct bio *bio)
 {
+	blk_qc_t ret = BLK_QC_T_NONE;
+	struct dm_target *ti = md->immutable_target;
+
+	if (unlikely(!map)) {
+		bio_io_error(bio);
+		return ret;
+	}
+
+	if (!ti) {
+		ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
+		if (unlikely(!ti || !dm_target_is_valid(ti))) {
+			bio_io_error(bio);
+			return ret;
+		}
+	}
+
+	/*
+	 * If in ->make_request_fn we need to use blk_queue_split(), otherwise
+	 * queue_limits for abnormal requests (e.g. discard, writesame, etc)
+	 * won't be imposed.
+	 */
+	if (current->bio_list) {
+		if (is_abnormal_io(bio))
+			blk_queue_split(md->queue, &bio);
+		else
+			dm_queue_split(md, ti, &bio);
+	}
+
 	if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
-		return __process_bio(md, map, bio);
+		return __process_bio(md, map, bio, ti);
 	else
 		return __split_and_process_bio(md, map, bio);
 }
-- 
2.15.0


* [PATCH v2 2/9] dm: eliminate 'split_discard_bios' flag from DM target interface
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 1/9] dm: update dm_process_bio() to split bio if in ->make_request_fn() Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-21  4:36   ` Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 3/9] dm: refactor start_io_acct and end_io_acct Mike Snitzer
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

There is no need to have DM core split discards on behalf of a DM target
now that blk_queue_split() handles splitting discards based on the
queue_limits.  A DM target just needs to set max_discard_sectors,
discard_granularity, etc., in queue_limits.
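
As a minimal sketch, a hypothetical target with a fixed 64KiB discard
boundary would express that via its io_hints hook roughly as follows
(real targets derive these values from their geometry, e.g. from
ti->private, as dm-raid does below):

  static void example_io_hints(struct dm_target *ti, struct queue_limits *limits)
  {
          unsigned chunk_bytes = 64 * 1024;

          /* discard_granularity is expressed in bytes... */
          limits->discard_granularity = chunk_bytes;
          /* ...max_discard_sectors in 512-byte sectors */
          limits->max_discard_sectors = chunk_bytes >> 9;
  }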

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c  |  1 -
 drivers/md/dm-raid.c          | 14 +++++++++-----
 drivers/md/dm-thin.c          |  1 -
 drivers/md/dm-zoned-target.c  |  1 -
 drivers/md/dm.c               | 28 ++++++----------------------
 include/linux/device-mapper.h |  6 ------
 include/uapi/linux/dm-ioctl.h |  4 ++--
 7 files changed, 17 insertions(+), 38 deletions(-)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index b29a8327eed1..adc529f12b6b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2496,7 +2496,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
-	ti->split_discard_bios = false;
 
 	ti->per_io_data_size = sizeof(struct per_bio_data);
 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index adcfe8ae10aa..9fdef6897316 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2986,11 +2986,6 @@ static void configure_discard_support(struct raid_set *rs)
 		}
 	}
 
-	/*
-	 * RAID1 and RAID10 personalities require bio splitting,
-	 * RAID0/4/5/6 don't and process large discard bios properly.
-	 */
-	ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs));
 	ti->num_discard_bios = 1;
 }
 
@@ -3747,6 +3742,15 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	blk_limits_io_min(limits, chunk_size);
 	blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
+
+	/*
+	 * RAID1 and RAID10 personalities require bio splitting,
+	 * RAID0/4/5/6 don't and process large discard bios properly.
+	 */
+	if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
+		limits->discard_granularity = chunk_size;
+		limits->max_discard_sectors = chunk_size;
+	}
 }
 
 static void raid_postsuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e83b63608262..0d9ded0f5e50 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -4227,7 +4227,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
-		ti->split_discard_bios = false;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 6af5babe6837..8865c1709e16 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -727,7 +727,6 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->per_io_data_size = sizeof(struct dmz_bioctx);
 	ti->flush_supported = true;
 	ti->discards_supported = true;
-	ti->split_discard_bios = true;
 
 	/* The exposed capacity is the number of chunks that can be mapped */
 	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7a774fcd0194..b988e178a523 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1478,17 +1478,10 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
 	return ti->num_write_zeroes_bios;
 }
 
-typedef bool (*is_split_required_fn)(struct dm_target *ti);
-
-static bool is_split_required_for_discard(struct dm_target *ti)
-{
-	return ti->split_discard_bios;
-}
-
 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
-				       unsigned num_bios, bool is_split_required)
+				       unsigned num_bios)
 {
-	unsigned len;
+	unsigned len = ci->sector_count;
 
 	/*
 	 * Even though the device advertised support for this type of
@@ -1499,38 +1492,29 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
 	if (!num_bios)
 		return -EOPNOTSUPP;
 
-	if (!is_split_required)
-		len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
-	else
-		len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
-
 	__send_duplicate_bios(ci, ti, num_bios, &len);
 
-	ci->sector += len;
-	ci->sector_count -= len;
-
 	return 0;
 }
 
 static int __send_discard(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti),
-					   is_split_required_for_discard(ti));
+	return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
 }
 
 static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
 }
 
 static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
 }
 
 static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
 {
-	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false);
+	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
 }
 
 static bool is_abnormal_io(struct bio *bio)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e528baebad69..0f5b3d7c6cb3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -315,12 +315,6 @@ struct dm_target {
 	 * whether or not its underlying devices have support.
 	 */
 	bool discards_supported:1;
-
-	/*
-	 * Set if the target required discard bios to be split
-	 * on max_io_len boundary.
-	 */
-	bool split_discard_bios:1;
 };
 
 /* Each target can link one of these into the table */
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index d1e49514977b..f396a82dfd3e 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -270,9 +270,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	39
+#define DM_VERSION_MINOR	40
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2018-04-03)"
+#define DM_VERSION_EXTRA	"-ioctl (2019-01-18)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
2.15.0


* [PATCH v2 3/9] dm: refactor start_io_acct and end_io_acct
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 1/9] dm: update dm_process_bio() to split bio if in ->make_request_fn() Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 2/9] dm: eliminate 'split_discard_bios' flag from DM target interface Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 4/9] dm: implement noclone optimization for bio-based Mike Snitzer
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

This refactoring of start_io_acct() and end_io_acct() eliminates their
dependency on 'struct dm_io', which will not be created for all IO (a
following commit introduces the ability for targets to avoid cloning
READ and WRITE bios).
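
With this refactoring, accounting only needs the mapped_device, the bio
and a saved start time -- no 'struct dm_io'.  A sketch of the intended
usage (this is how the noclone path in the following commit uses it):

  unsigned long start_time = jiffies;

  start_io_acct(md, bio);
  /* ... remap and submit the bio ... */

  /* later, on completion: */
  end_io_acct(md, bio, start_time);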

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 68 +++++++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 38 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b988e178a523..1b87d20041e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -580,7 +580,19 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 	return r;
 }
 
-static void start_io_acct(struct dm_io *io);
+static void start_io_acct(struct mapped_device *md, struct bio *bio)
+{
+	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), &dm_disk(md)->part0);
+}
+
+static void end_io_acct(struct mapped_device *md, struct bio *bio, unsigned long start_time)
+{
+	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, start_time);
+
+	/* nudge anyone waiting on suspend queue */
+	if (unlikely(wq_has_sleeper(&md->wait)))
+		wake_up(&md->wait);
+}
 
 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 {
@@ -604,7 +616,14 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	io->md = md;
 	spin_lock_init(&io->endio_lock);
 
-	start_io_acct(io);
+	io->start_time = jiffies;
+
+	start_io_acct(md, bio);
+
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+				    bio->bi_iter.bi_sector, bio_sectors(bio),
+				    false, 0, &io->stats_aux);
 
 	return io;
 }
@@ -668,41 +687,6 @@ static bool md_in_flight(struct mapped_device *md)
 		return md_in_flight_bios(md);
 }
 
-static void start_io_acct(struct dm_io *io)
-{
-	struct mapped_device *md = io->md;
-	struct bio *bio = io->orig_bio;
-
-	io->start_time = jiffies;
-
-	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
-			      &dm_disk(md)->part0);
-
-	if (unlikely(dm_stats_used(&md->stats)))
-		dm_stats_account_io(&md->stats, bio_data_dir(bio),
-				    bio->bi_iter.bi_sector, bio_sectors(bio),
-				    false, 0, &io->stats_aux);
-}
-
-static void end_io_acct(struct dm_io *io)
-{
-	struct mapped_device *md = io->md;
-	struct bio *bio = io->orig_bio;
-	unsigned long duration = jiffies - io->start_time;
-
-	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
-			    io->start_time);
-
-	if (unlikely(dm_stats_used(&md->stats)))
-		dm_stats_account_io(&md->stats, bio_data_dir(bio),
-				    bio->bi_iter.bi_sector, bio_sectors(bio),
-				    true, duration, &io->stats_aux);
-
-	/* nudge anyone waiting on suspend queue */
-	if (unlikely(wq_has_sleeper(&md->wait)))
-		wake_up(&md->wait);
-}
-
 /*
  * Add the bio to the list of deferred io.
  */
@@ -941,7 +925,15 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
 
 		io_error = io->status;
 		bio = io->orig_bio;
-		end_io_acct(io);
+
+		if (unlikely(dm_stats_used(&md->stats))) {
+			unsigned long duration = jiffies - io->start_time;
+			dm_stats_account_io(&md->stats, bio_data_dir(bio),
+					    bio->bi_iter.bi_sector, bio_sectors(bio),
+					    true, duration, &io->stats_aux);
+		}
+
+		end_io_acct(md, bio, io->start_time);
 		free_io(md, io);
 
 		if (io_error == BLK_STS_DM_REQUEUE)
-- 
2.15.0


* [PATCH v2 4/9] dm: implement noclone optimization for bio-based
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
                   ` (2 preceding siblings ...)
  2019-02-20 21:44 ` [PATCH v2 3/9] dm: refactor start_io_acct and end_io_acct Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 5/9] dm: improve noclone bio support Mike Snitzer
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Until now bio-based DM core has always cloned an incoming bio, remapped
the clone bio to a lower layer, dealt with the clone's completion and
then finally completed the original bio.  This cloning can be avoided
for READ and WRITE bios if the target opts in by setting ti->no_clone.

Avoiding cloning for READ and WRITE bios improves performance of targets
that do very little work in response to each bio (e.g. dm-linear and
dm-striped).

The improvement is accomplished by changing DM core to allocate a
'dm_noclone' structure (that is quite small) instead of cloning the bio.
The bio's bi_end_io and bi_private are saved in the 'dm_noclone' before
they are overwritten and the bio is passed to the lower block device.

When the bio completes, noclone_endio() restores the saved bi_end_io
and bi_private values and passes the bio to the original bi_end_io
function.

If the allocation of the 'struct dm_noclone' fails, bio-based DM falls
back to the traditional bio cloning IO path that is backed by mempool
reservations.
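
The completion side of the save/restore is small -- a condensed sketch
of noclone_endio() (the full version is in the patch below):

  static void noclone_endio(struct bio *bio)
  {
          struct dm_noclone *noclone = bio->bi_private;

          end_io_acct(noclone->md, bio, noclone->start_time);

          /* undo the submission-time hijack of the bio's completion */
          bio->bi_end_io = noclone->orig_bi_end_io;
          bio->bi_private = noclone->orig_bi_private;
          kfree(noclone);

          bio_endio(bio);  /* now invokes the original bi_end_io */
  }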

Performance improvement for dm-linear:

x86-64, 2x six-core
/dev/ram0					2449MiB/s
/dev/mapper/lin 5.0-rc without optimization	1970MiB/s
/dev/mapper/lin 5.0-rc with optimization	2238MiB/s

arm64, quad core:
/dev/ram0					457MiB/s
/dev/mapper/lin 5.0-rc without optimization	325MiB/s
/dev/mapper/lin 5.0-rc with optimization	364MiB/s

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-linear.c        |  3 +-
 drivers/md/dm-stripe.c        |  3 +-
 drivers/md/dm-table.c         | 11 ++++++++
 drivers/md/dm-zero.c          |  1 +
 drivers/md/dm.c               | 64 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm.h               |  1 +
 include/linux/device-mapper.h |  9 ++++++
 7 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ad980a38fb1e..573ee0c5e83a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -62,6 +62,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_secure_erase_bios = 1;
 	ti->num_write_same_bios = 1;
 	ti->num_write_zeroes_bios = 1;
+	ti->no_clone = true;
 	ti->private = lc;
 	return 0;
 
@@ -216,7 +217,7 @@ static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
 
 static struct target_type linear_target = {
 	.name   = "linear",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 #ifdef CONFIG_BLK_DEV_ZONED
 	.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
 	.report_zones = linear_report_zones,
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 8547d7594338..0081bfe03e64 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -172,6 +172,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_secure_erase_bios = stripes;
 	ti->num_write_same_bios = stripes;
 	ti->num_write_zeroes_bios = stripes;
+	ti->no_clone = true;
 
 	sc->chunk_size = chunk_size;
 	if (chunk_size & (chunk_size - 1))
@@ -486,7 +487,7 @@ static void stripe_io_hints(struct dm_target *ti,
 
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.features = DM_TARGET_PASSES_INTEGRITY,
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4b1be754cc41..6a3e23faeb7d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -47,6 +47,7 @@ struct dm_table {
 
 	bool integrity_supported:1;
 	bool singleton:1;
+	bool no_clone:1;
 	unsigned integrity_added:1;
 
 	/*
@@ -191,6 +192,8 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 	if (!t)
 		return -ENOMEM;
 
+	t->no_clone = true;
+
 	INIT_LIST_HEAD(&t->devices);
 	INIT_LIST_HEAD(&t->target_callbacks);
 
@@ -789,6 +792,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 	if (r)
 		goto bad;
 
+	if (!tgt->no_clone)
+		t->no_clone = false;
+
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
 	if (!tgt->num_discard_bios && tgt->discards_supported)
@@ -1376,6 +1382,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
 	return 0;
 }
 
+bool dm_table_supports_noclone(struct dm_table *table)
+{
+	return table->no_clone;
+}
+
 /*
  * Check whether a table has no data devices attached using each
  * target's iterate_devices method.
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index b65ca8dcfbdc..436a5ee89698 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -26,6 +26,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	 * Silently drop discards, avoiding -EOPNOTSUPP.
 	 */
 	ti->num_discard_bios = 1;
+	ti->no_clone = true;
 
 	return 0;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1b87d20041e7..57919f211acc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -102,6 +102,16 @@ struct dm_io {
 	struct dm_target_io tio;
 };
 
+/*
+ * One of these is allocated per noclone bio.
+ */
+struct dm_noclone {
+	struct mapped_device *md;
+	bio_end_io_t *orig_bi_end_io;
+	void *orig_bi_private;
+	unsigned long start_time;
+};
+
 void *dm_per_bio_data(struct bio *bio, size_t data_size)
 {
 	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
@@ -1009,6 +1019,20 @@ static void clone_endio(struct bio *bio)
 	dec_pending(io, error);
 }
 
+static void noclone_endio(struct bio *bio)
+{
+	struct dm_noclone *noclone = bio->bi_private;
+	struct mapped_device *md = noclone->md;
+
+	end_io_acct(md, bio, noclone->start_time);
+
+	bio->bi_end_io = noclone->orig_bi_end_io;
+	bio->bi_private = noclone->orig_bi_private;
+	kfree(noclone);
+
+	bio_endio(bio);
+}
+
 /*
  * Return maximum size of I/O possible at the supplied sector up to the current
  * target boundary.
@@ -1774,8 +1798,48 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
 		return ret;
 	}
 
+	if (dm_table_supports_noclone(map) &&
+	    (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE) &&
+	    likely(!(bio->bi_opf & REQ_PREFLUSH)) &&
+	    !bio_flagged(bio, BIO_CHAIN) &&
+	    likely(!bio_integrity(bio)) &&
+	    likely(!dm_stats_used(&md->stats))) {
+		int r;
+		struct dm_noclone *noclone;
+		struct dm_target *ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
+		if (unlikely(!dm_target_is_valid(ti)))
+			goto no_fast_path;
+		if (unlikely(bio_sectors(bio) > max_io_len(bio->bi_iter.bi_sector, ti)))
+			goto no_fast_path;
+		noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
+		if (unlikely(!noclone))
+			goto no_fast_path;
+		noclone->md = md;
+		noclone->start_time = jiffies;
+		noclone->orig_bi_end_io = bio->bi_end_io;
+		noclone->orig_bi_private = bio->bi_private;
+		bio->bi_end_io = noclone_endio;
+		bio->bi_private = noclone;
+		start_io_acct(md, bio);
+		r = ti->type->map(ti, bio);
+		ret = BLK_QC_T_NONE;
+		if (likely(r == DM_MAPIO_REMAPPED)) {
+			ret = generic_make_request(bio);
+		} else if (likely(r == DM_MAPIO_SUBMITTED)) {
+		} else if (r == DM_MAPIO_KILL) {
+			bio->bi_status = BLK_STS_IOERR;
+			noclone_endio(bio);
+		} else {
+			DMWARN("unimplemented target map return value: %d", r);
+			BUG();
+		}
+		goto put_table_ret;
+	}
+
+no_fast_path:
 	ret = dm_process_bio(md, map, bio);
 
+put_table_ret:
 	dm_put_live_table(md, srcu_idx);
 	return ret;
 }
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 2d539b82ec08..c3c78123dfb3 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -53,6 +53,7 @@ void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
+bool dm_table_supports_noclone(struct dm_table *t);
 bool dm_table_has_no_data_devices(struct dm_table *table);
 int dm_calculate_queue_limits(struct dm_table *table,
 			      struct queue_limits *limits);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 0f5b3d7c6cb3..d38306476c0b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -315,6 +315,15 @@ struct dm_target {
 	 * whether or not its underlying devices have support.
 	 */
 	bool discards_supported:1;
+
+	/*
+	 * Set if this target can process bios without cloning them.
+	 * The target's per bio processing must be fast enough that DM core's
+	 * cloning is not dwarfed by per-bio work in the target.
+	 * This also implies the target is sufficiently simple so as not to
+	 * require complex block capabilities (e.g. integrity, cloning, etc).
+	 */
+	bool no_clone:1;
 };
 
 /* Each target can link one of these into the table */
-- 
2.15.0


* [PATCH v2 5/9] dm: improve noclone bio support
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
                   ` (3 preceding siblings ...)
  2019-02-20 21:44 ` [PATCH v2 4/9] dm: implement noclone optimization for bio-based Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-22 10:59   ` Mikulas Patocka
  2019-02-20 21:44 ` [PATCH v2 6/9] dm: add per-bio-data support to noclone bio Mike Snitzer
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

Leverage the fact that dm_queue_split() enables use of noclone support
even for targets that require additional splitting (via ti->max_io_len).

To achieve this, move noclone processing from dm_make_request() to
dm_process_bio() and check if bio->bi_end_io is already set to
noclone_endio.  This also fixes dm_wq_work() to be able to support
noclone bios (because it calls dm_process_bio()).

While at it, clean up ->map() return processing to be comparable to
__map_bio() and add some code comments that explain why certain
conditionals exist to prevent the use of noclone.
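
The key to supporting re-entry via dm_wq_work() is that a bio whose
bi_end_io is already noclone_endio had its state attached on the first
pass -- condensed from the patch below:

  if (bio->bi_end_io != noclone_endio) {
          /* first pass: allocate and attach the noclone state */
          noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
          if (unlikely(!noclone))
                  goto no_fast_path;
          /* ... save bi_end_io/bi_private, point them at noclone ... */
  } else {
          /* re-entered (e.g. via dm_wq_work): state already attached */
          noclone = bio->bi_private;
  }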

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 103 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 62 insertions(+), 41 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 57919f211acc..d84735be6927 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1764,14 +1764,75 @@ static blk_qc_t dm_process_bio(struct mapped_device *md,
 	 * If in ->make_request_fn we need to use blk_queue_split(), otherwise
 	 * queue_limits for abnormal requests (e.g. discard, writesame, etc)
 	 * won't be imposed.
+	 *
+	 * For normal READ and WRITE requests we benefit from upfront splitting
+	 * via dm_queue_split() because we can then leverage noclone support below.
 	 */
-	if (current->bio_list) {
+	if (current->bio_list && (bio->bi_end_io != noclone_endio)) {
+		/*
+		 * It is fine to use {blk,dm}_queue_split() before opting to use noclone
+		 * because any ->bi_private and ->bi_end_io will get saved off below.
+		 * Once noclone_endio() is called the bio_chain's endio will kick in.
+		 * - __split_and_process_bio() can split further, but any targets that
+		 *   require late _DM_ initiated splitting (e.g. dm_accept_partial_bio()
+		 *   callers) shouldn't set ti->no_clone.
+		 */
 		if (is_abnormal_io(bio))
 			blk_queue_split(md->queue, &bio);
 		else
 			dm_queue_split(md, ti, &bio);
 	}
 
+	if (dm_table_supports_noclone(map) &&
+	    (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE) &&
+	    likely(!(bio->bi_opf & REQ_PREFLUSH)) &&
+	    likely(!bio_integrity(bio)) && /* integrity requires specialized processing */
+	    likely(!dm_stats_used(&md->stats))) { /* noclone doesn't support dm-stats */
+		int r;
+		struct dm_noclone *noclone;
+
+		/* Already been here if bio->bi_end_io is noclone_endio, reentered via dm_wq_work() */
+		if (bio->bi_end_io != noclone_endio) {
+			noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
+			if (unlikely(!noclone))
+				goto no_fast_path;
+
+			noclone->md = md;
+			noclone->start_time = jiffies;
+			noclone->orig_bi_end_io = bio->bi_end_io;
+			noclone->orig_bi_private = bio->bi_private;
+			bio->bi_end_io = noclone_endio;
+			bio->bi_private = noclone;
+		} else
+			noclone = bio->bi_private;
+
+		start_io_acct(md, bio);
+		r = ti->type->map(ti, bio);
+		switch (r) {
+		case DM_MAPIO_SUBMITTED:
+			break;
+		case DM_MAPIO_REMAPPED:
+			/* the bio has been remapped so dispatch it */
+			if (md->type == DM_TYPE_NVME_BIO_BASED)
+				ret = direct_make_request(bio);
+			else
+				ret = generic_make_request(bio);
+			break;
+		case DM_MAPIO_KILL:
+			bio_io_error(bio);
+			break;
+		case DM_MAPIO_REQUEUE:
+			bio->bi_status = BLK_STS_DM_REQUEUE;
+			noclone_endio(bio);
+			break;
+		default:
+			DMWARN("unimplemented target map return value: %d", r);
+			BUG();
+		}
+
+		return ret;
+	}
+no_fast_path:
 	if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
 		return __process_bio(md, map, bio, ti);
 	else
@@ -1798,48 +1859,8 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
 		return ret;
 	}
 
-	if (dm_table_supports_noclone(map) &&
-	    (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE) &&
-	    likely(!(bio->bi_opf & REQ_PREFLUSH)) &&
-	    !bio_flagged(bio, BIO_CHAIN) &&
-	    likely(!bio_integrity(bio)) &&
-	    likely(!dm_stats_used(&md->stats))) {
-		int r;
-		struct dm_noclone *noclone;
-		struct dm_target *ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
-		if (unlikely(!dm_target_is_valid(ti)))
-			goto no_fast_path;
-		if (unlikely(bio_sectors(bio) > max_io_len(bio->bi_iter.bi_sector, ti)))
-			goto no_fast_path;
-		noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
-		if (unlikely(!noclone))
-			goto no_fast_path;
-		noclone->md = md;
-		noclone->start_time = jiffies;
-		noclone->orig_bi_end_io = bio->bi_end_io;
-		noclone->orig_bi_private = bio->bi_private;
-		bio->bi_end_io = noclone_endio;
-		bio->bi_private = noclone;
-		start_io_acct(md, bio);
-		r = ti->type->map(ti, bio);
-		ret = BLK_QC_T_NONE;
-		if (likely(r == DM_MAPIO_REMAPPED)) {
-			ret = generic_make_request(bio);
-		} else if (likely(r == DM_MAPIO_SUBMITTED)) {
-		} else if (r == DM_MAPIO_KILL) {
-			bio->bi_status = BLK_STS_IOERR;
-			noclone_endio(bio);
-		} else {
-			DMWARN("unimplemented target map return value: %d", r);
-			BUG();
-		}
-		goto put_table_ret;
-	}
-
-no_fast_path:
 	ret = dm_process_bio(md, map, bio);
 
-put_table_ret:
 	dm_put_live_table(md, srcu_idx);
 	return ret;
 }
-- 
2.15.0


* [PATCH v2 6/9] dm: add per-bio-data support to noclone bio
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
                   ` (4 preceding siblings ...)
  2019-02-20 21:44 ` [PATCH v2 5/9] dm: improve noclone bio support Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 7/9] dm: improve noclone_endio() to support multipath target Mike Snitzer
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Allocate additional space after the 'struct dm_noclone' for the
target-requested per_io_data_size and place DM_NOCLONE_MAGIC at the
end.

Update both dm_per_bio_data() and dm_bio_from_per_bio_data() to branch
accordingly based on whether the bio/data is from a clone or a noclone.
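
The resulting allocation layout (sketch):

  +-------------------+------------------+------------------+
  | struct dm_noclone | per-bio-data     | DM_NOCLONE_MAGIC |
  +-------------------+------------------+------------------+
  ^                   ^
  bio->bi_private     dm_per_bio_data() returns noclone + 1

dm_bio_from_per_bio_data() reads the 'unsigned' that follows the
per-bio-data: for a clone it lands on DM_IO_MAGIC or DM_TIO_MAGIC,
for a noclone it finds DM_NOCLONE_MAGIC and returns noclone->bio.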

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d84735be6927..7e022f840419 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -26,6 +26,8 @@
 #include <linux/pr.h>
 #include <linux/refcount.h>
 
+#include <asm/unaligned.h>
+
 #define DM_MSG_PREFIX "core"
 
 /*
@@ -105,16 +107,29 @@ struct dm_io {
 /*
  * One of these is allocated per noclone bio.
  */
+#define DM_NOCLONE_MAGIC 9693664
 struct dm_noclone {
 	struct mapped_device *md;
+	struct bio *bio;
 	bio_end_io_t *orig_bi_end_io;
 	void *orig_bi_private;
 	unsigned long start_time;
+	/* ... per-bio-data ... */
+	/* DM_NOCLONE_MAGIC */
 };
 
+static void noclone_endio(struct bio *bio);
+
 void *dm_per_bio_data(struct bio *bio, size_t data_size)
 {
-	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+	struct dm_target_io *tio;
+
+	if (bio->bi_end_io == noclone_endio) {
+		struct dm_noclone *noclone = bio->bi_private;
+		return noclone + 1;
+	}
+
+	tio = container_of(bio, struct dm_target_io, clone);
 	if (!tio->inside_dm_io)
 		return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
 	return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
@@ -124,9 +139,14 @@ EXPORT_SYMBOL_GPL(dm_per_bio_data);
 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 {
 	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
-	if (io->magic == DM_IO_MAGIC)
+	unsigned magic = get_unaligned(&io->magic);
+	if (magic == DM_NOCLONE_MAGIC) {
+		struct dm_noclone *noclone = (struct dm_noclone *)data - 1;
+		return noclone->bio;
+	}
+	if (magic == DM_IO_MAGIC)
 		return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
-	BUG_ON(io->magic != DM_TIO_MAGIC);
+	BUG_ON(magic != DM_TIO_MAGIC);
 	return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
 }
 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
@@ -1793,14 +1813,18 @@ static blk_qc_t dm_process_bio(struct mapped_device *md,
 
 		/* Already been here if bio->bi_end_io is noclone_endio, reentered via dm_wq_work() */
 		if (bio->bi_end_io != noclone_endio) {
-			noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
+			noclone = kmalloc_node(sizeof(*noclone) + ti->per_io_data_size + sizeof(unsigned),
+					       GFP_NOWAIT, md->numa_node_id);
 			if (unlikely(!noclone))
 				goto no_fast_path;
 
 			noclone->md = md;
 			noclone->start_time = jiffies;
+			noclone->bio = bio;
 			noclone->orig_bi_end_io = bio->bi_end_io;
 			noclone->orig_bi_private = bio->bi_private;
+			put_unaligned(DM_NOCLONE_MAGIC,
+				      (unsigned *)((char *)noclone + sizeof(*noclone) + ti->per_io_data_size));
 			bio->bi_end_io = noclone_endio;
 			bio->bi_private = noclone;
 		} else
-- 
2.15.0


* [PATCH v2 7/9] dm: improve noclone_endio() to support multipath target
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
                   ` (5 preceding siblings ...)
  2019-02-20 21:44 ` [PATCH v2 6/9] dm: add per-bio-data support to noclone bio Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 8/9] dm mpath: enable noclone support for bio-based Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 9/9] dm: remove unused _rq_tio_cache and _rq_cache Mike Snitzer
  8 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

noclone_endio() must call the target's ->end_io method if it exists.
It must also handle the various DM_ENDIO_* returns that are possible.

Factor out dm_bio_pushback_needed() for both dec_pending() and
noclone_endio() to use when handling BLK_STS_DM_REQUEUE.

Lastly, there is no need to conditionally set md->immutable_target in
__bind().  If the device only uses a single immutable target then it
should always be reflected in md->immutable_target.  This is important
because noclone_endio() benefits from this to get access to the
multipath dm_target without needing to add another member to the
'dm_noclone' structure.
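
Condensed, the new ->end_io dispatch in noclone_endio() is (the
unknown-return BUG() handling is elided; see the patch below):

  if (endio) {
          int r = endio(ti, bio, &error);
          switch (r) {
          case DM_ENDIO_REQUEUE:
                  error = BLK_STS_DM_REQUEUE;
                  /*FALLTHRU*/
          case DM_ENDIO_DONE:
                  break;
          case DM_ENDIO_INCOMPLETE:
                  return;  /* the target will handle the io */
          }
  }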

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 80 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 23 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7e022f840419..0943913bceca 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -919,6 +919,28 @@ static int __noflush_suspending(struct mapped_device *md)
 	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 }
 
+static bool dm_bio_pushback_needed(struct mapped_device *md,
+				   struct bio *bio, blk_status_t *error)
+{
+	unsigned long flags;
+
+	/*
+	 * Target requested pushing back the I/O.
+	 */
+	if (__noflush_suspending(md)) {
+		spin_lock_irqsave(&md->deferred_lock, flags);
+		// FIXME: using bio_list_add_head() creates LIFO
+		bio_list_add_head(&md->deferred, bio);
+		spin_unlock_irqrestore(&md->deferred_lock, flags);
+		return true;
+	} else {
+		/* noflush suspend was interrupted. */
+		*error = BLK_STS_IOERR;
+	}
+
+	return false;
+}
+
 /*
  * Decrements the number of outstanding ios that a bio has been
  * cloned into, completing the original io if necc.
@@ -939,22 +961,12 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
 	}
 
 	if (atomic_dec_and_test(&io->io_count)) {
-		if (io->status == BLK_STS_DM_REQUEUE) {
-			/*
-			 * Target requested pushing back the I/O.
-			 */
-			spin_lock_irqsave(&md->deferred_lock, flags);
-			if (__noflush_suspending(md))
-				/* NOTE early return due to BLK_STS_DM_REQUEUE below */
-				bio_list_add_head(&md->deferred, io->orig_bio);
-			else
-				/* noflush suspend was interrupted. */
-				io->status = BLK_STS_IOERR;
-			spin_unlock_irqrestore(&md->deferred_lock, flags);
-		}
+		bio = io->orig_bio;
+
+		if (unlikely(io->status == BLK_STS_DM_REQUEUE))
+			(void) dm_bio_pushback_needed(md, bio, &io->status);
 
 		io_error = io->status;
-		bio = io->orig_bio;
 
 		if (unlikely(dm_stats_used(&md->stats))) {
 			unsigned long duration = jiffies - io->start_time;
@@ -1041,8 +1053,33 @@ static void clone_endio(struct bio *bio)
 
 static void noclone_endio(struct bio *bio)
 {
+	blk_status_t error = bio->bi_status;
 	struct dm_noclone *noclone = bio->bi_private;
 	struct mapped_device *md = noclone->md;
+	struct dm_target *ti = NULL;
+	dm_endio_fn endio = NULL;
+
+	if (md->immutable_target) {
+		ti = md->immutable_target;
+		endio = ti->type->end_io;
+	}
+
+	if (endio) {
+		int r = endio(ti, bio, &error);
+		switch (r) {
+		case DM_ENDIO_REQUEUE:
+			error = BLK_STS_DM_REQUEUE;
+			/*FALLTHRU*/
+		case DM_ENDIO_DONE:
+			break;
+		case DM_ENDIO_INCOMPLETE:
+			/* The target will handle the io */
+			return;
+		default:
+			DMWARN("unimplemented target endio return value: %d", r);
+			BUG();
+		}
+	}
 
 	end_io_acct(md, bio, noclone->start_time);
 
@@ -1050,6 +1087,11 @@ static void noclone_endio(struct bio *bio)
 	bio->bi_private = noclone->orig_bi_private;
 	kfree(noclone);
 
+	if (unlikely(error == BLK_STS_DM_REQUEUE)) {
+		if (dm_bio_pushback_needed(md, bio, &bio->bi_status))
+			return;
+	}
+
 	bio_endio(bio);
 }
 
@@ -2249,15 +2291,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	if (request_based)
 		dm_stop_queue(q);
 
-	if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
-		/*
-		 * Leverage the fact that request-based DM targets and
-		 * NVMe bio based targets are immutable singletons
-		 * - used to optimize both dm_request_fn and dm_mq_queue_rq;
-		 *   and __process_bio.
-		 */
-		md->immutable_target = dm_table_get_immutable_target(t);
-	}
+	md->immutable_target = dm_table_get_immutable_target(t);
 
 	ret = __bind_mempools(md, t);
 	if (ret) {
-- 
2.15.0


* [PATCH v2 8/9] dm mpath: enable noclone support for bio-based
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
                   ` (6 preceding siblings ...)
  2019-02-20 21:44 ` [PATCH v2 7/9] dm: improve noclone_endio() to support multipath target Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  2019-02-20 21:44 ` [PATCH v2 9/9] dm: remove unused _rq_tio_cache and _rq_cache Mike Snitzer
  8 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

Avoiding bio cloning (aka noclone) offers a slight advantage over bio
cloning for this read workload:
  fio --ioengine=psync --iodepth=1 --rw=read --bs=512 --direct=1 \
      --numjobs=12 --time_based --runtime=10 --group_reporting --name=/dev/mapper/mpath

clone   3331MB/s 3337MB/s 3355MB/s
noclone 3418MB/s 3407MB/s 3420MB/s

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2ee5e357a0a7..579330cd7a6a 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1178,9 +1178,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
 	ti->num_write_zeroes_bios = 1;
-	if (m->queue_mode == DM_TYPE_BIO_BASED)
+	if (m->queue_mode == DM_TYPE_BIO_BASED) {
+		ti->no_clone = true;
 		ti->per_io_data_size = multipath_per_bio_data_size();
-	else
+	} else
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
@@ -1584,11 +1585,11 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 	return r;
 }
 
-static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
+static int multipath_end_io_bio(struct dm_target *ti, struct bio *bio,
 				blk_status_t *error)
 {
 	struct multipath *m = ti->private;
-	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
+	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
 	struct pgpath *pgpath = mpio->pgpath;
 	unsigned long flags;
 	int r = DM_ENDIO_DONE;
@@ -1611,7 +1612,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
 	}
 
 	spin_lock_irqsave(&m->lock, flags);
-	bio_list_add(&m->queued_bios, clone);
+	bio_list_add(&m->queued_bios, bio);
 	spin_unlock_irqrestore(&m->lock, flags);
 	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
 		queue_work(kmultipathd, &m->process_queued_bios);
@@ -2011,7 +2012,7 @@ static int multipath_busy(struct dm_target *ti)
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 13, 0},
+	.version = {1, 14, 0},
 	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
 		    DM_TARGET_PASSES_INTEGRITY,
 	.module = THIS_MODULE,
-- 
2.15.0


* [PATCH v2 9/9] dm: remove unused _rq_tio_cache and _rq_cache
  2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
                   ` (7 preceding siblings ...)
  2019-02-20 21:44 ` [PATCH v2 8/9] dm mpath: enable noclone support for bio-based Mike Snitzer
@ 2019-02-20 21:44 ` Mike Snitzer
  8 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-20 21:44 UTC (permalink / raw)
  To: dm-devel

Also move the dm_rq_target_io structure definition from dm-rq.h to dm-rq.c.

Fixes: 6a23e05c2fe3c6 ("dm: remove legacy request-based IO path")
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-rq.c | 16 ++++++++++++++++
 drivers/md/dm-rq.h | 16 ----------------
 drivers/md/dm.c    | 22 ++--------------------
 3 files changed, 18 insertions(+), 36 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index a20531e5f3b4..9428cd951e3b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -12,6 +12,22 @@
 
 #define DM_MSG_PREFIX "core-rq"
 
+/*
+ * One of these is allocated per request.
+ */
+struct dm_rq_target_io {
+	struct mapped_device *md;
+	struct dm_target *ti;
+	struct request *orig, *clone;
+	struct kthread_work work;
+	blk_status_t error;
+	union map_info info;
+	struct dm_stats_aux stats_aux;
+	unsigned long duration_jiffies;
+	unsigned n_sectors;
+	unsigned completed;
+};
+
 #define DM_MQ_NR_HW_QUEUES 1
 #define DM_MQ_QUEUE_DEPTH 2048
 static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index b39245545229..1eea0da641db 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -16,22 +16,6 @@
 
 struct mapped_device;
 
-/*
- * One of these is allocated per request.
- */
-struct dm_rq_target_io {
-	struct mapped_device *md;
-	struct dm_target *ti;
-	struct request *orig, *clone;
-	struct kthread_work work;
-	blk_status_t error;
-	union map_info info;
-	struct dm_stats_aux stats_aux;
-	unsigned long duration_jiffies;
-	unsigned n_sectors;
-	unsigned completed;
-};
-
 /*
  * For request-based dm - the bio clones we allocate are embedded in these
  * structs.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0943913bceca..a58f35d31fd0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -188,9 +188,6 @@ struct table_device {
 	struct dm_dev dm_dev;
 };
 
-static struct kmem_cache *_rq_tio_cache;
-static struct kmem_cache *_rq_cache;
-
 /*
  * Bio-based DM's mempools' reserved IOs set by the user.
  */
@@ -252,20 +249,11 @@ static unsigned dm_get_numa_node(void)
 
 static int __init local_init(void)
 {
-	int r = -ENOMEM;
-
-	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
-	if (!_rq_tio_cache)
-		return r;
-
-	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
-				      __alignof__(struct request), 0, NULL);
-	if (!_rq_cache)
-		goto out_free_rq_tio_cache;
+	int r;
 
 	r = dm_uevent_init();
 	if (r)
-		goto out_free_rq_cache;
+		return r;
 
 	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 	if (!deferred_remove_workqueue) {
@@ -287,10 +275,6 @@ static int __init local_init(void)
 	destroy_workqueue(deferred_remove_workqueue);
 out_uevent_exit:
 	dm_uevent_exit();
-out_free_rq_cache:
-	kmem_cache_destroy(_rq_cache);
-out_free_rq_tio_cache:
-	kmem_cache_destroy(_rq_tio_cache);
 
 	return r;
 }
@@ -300,8 +284,6 @@ static void local_exit(void)
 	flush_scheduled_work();
 	destroy_workqueue(deferred_remove_workqueue);
 
-	kmem_cache_destroy(_rq_cache);
-	kmem_cache_destroy(_rq_tio_cache);
 	unregister_blkdev(_major, _name);
 	dm_uevent_exit();
 
-- 
2.15.0


* Re: [PATCH v2 2/9] dm: eliminate 'split_discard_bios' flag from DM target interface
  2019-02-20 21:44 ` [PATCH v2 2/9] dm: eliminate 'split_discard_bios' flag from DM target interface Mike Snitzer
@ 2019-02-21  4:36   ` Mike Snitzer
  0 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-21  4:36 UTC (permalink / raw)
  To: dm-devel; +Cc: Ming Lei

On Wed, Feb 20 2019 at  4:44pm -0500,
Mike Snitzer <snitzer@redhat.com> wrote:

> There is no need to have DM core split discards on behalf of a DM target
> now that blk_queue_split() handles splitting discards based on the
> queue_limits.  A DM target just needs to set max_discard_sectors,
> discard_granularity, etc, in queue_limits.
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm-cache-target.c  |  1 -
>  drivers/md/dm-raid.c          | 14 +++++++++-----
>  drivers/md/dm-thin.c          |  1 -
>  drivers/md/dm-zoned-target.c  |  1 -
>  drivers/md/dm.c               | 28 ++++++----------------------
>  include/linux/device-mapper.h |  6 ------
>  include/uapi/linux/dm-ioctl.h |  4 ++--
>  7 files changed, 17 insertions(+), 38 deletions(-)
> 

...

> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 7a774fcd0194..b988e178a523 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1478,17 +1478,10 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
>  	return ti->num_write_zeroes_bios;
>  }
>  
> -typedef bool (*is_split_required_fn)(struct dm_target *ti);
> -
> -static bool is_split_required_for_discard(struct dm_target *ti)
> -{
> -	return ti->split_discard_bios;
> -}
> -
>  static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
> -				       unsigned num_bios, bool is_split_required)
> +				       unsigned num_bios)
>  {
> -	unsigned len;
> +	unsigned len = ci->sector_count;
>  
>  	/*
>  	 * Even though the device advertised support for this type of
> @@ -1499,38 +1492,29 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
>  	if (!num_bios)
>  		return -EOPNOTSUPP;
>  
> -	if (!is_split_required)
> -		len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
> -	else
> -		len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
> -
>  	__send_duplicate_bios(ci, ti, num_bios, &len);
>  
> -	ci->sector += len;
> -	ci->sector_count -= len;
> -
>  	return 0;
>  }

The above was bogus: ci->sector and ci->sector_count must be updated.
Reintroducing the adjustments based on 'len' fixed a discard crash that
Ming reported.

Now fixed in linux-next.
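
For reference, the helper with the adjustments reintroduced looks
roughly like this (a sketch; see linux-next for the final form):

  static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
                                         unsigned num_bios)
  {
          unsigned len;

          if (!num_bios)
                  return -EOPNOTSUPP;

          len = min((sector_t)ci->sector_count,
                    max_io_len_target_boundary(ci->sector, ti));

          __send_duplicate_bios(ci, ti, num_bios, &len);

          /* the adjustments the v2 patch wrongly dropped */
          ci->sector += len;
          ci->sector_count -= len;

          return 0;
  }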

Mike


* Re: [PATCH v2 5/9] dm: improve noclone bio support
  2019-02-20 21:44 ` [PATCH v2 5/9] dm: improve noclone bio support Mike Snitzer
@ 2019-02-22 10:59   ` Mikulas Patocka
  2019-02-22 15:22     ` Mike Snitzer
  0 siblings, 1 reply; 14+ messages in thread
From: Mikulas Patocka @ 2019-02-22 10:59 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel

Hi

I've found out that this patch causes a timeout in the lvm test
shell/activate-missing-segment.sh and some others.

Mikulas



On Wed, 20 Feb 2019, Mike Snitzer wrote:

> Leverage the fact that dm_queue_split() enables use of noclone support
> even for targets that require additional splitting (via ti->max_io_len).
> 
> To achieve this, move noclone processing from dm_make_request() to
> dm_process_bio() and check if bio->bi_end_io is already set to
> noclone_endio.  This also fixes dm_wq_work() to be able to support
> noclone bios (because it calls dm_process_bio()).
> 
> While at it, clean up ->map() return processing to be comparable to
> __map_bio() and add some code comments that explain why certain
> conditionals exist to prevent the use of noclone.
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>  drivers/md/dm.c | 103 ++++++++++++++++++++++++++++++++++----------------------
>  1 file changed, 62 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 57919f211acc..d84735be6927 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1764,14 +1764,75 @@ static blk_qc_t dm_process_bio(struct mapped_device *md,
>  	 * If in ->make_request_fn we need to use blk_queue_split(), otherwise
>  	 * queue_limits for abnormal requests (e.g. discard, writesame, etc)
>  	 * won't be imposed.
> +	 *
> +	 * For normal READ and WRITE requests we benefit from upfront splitting
> +	 * via dm_queue_split() because we can then leverage noclone support below.
>  	 */
> -	if (current->bio_list) {
> +	if (current->bio_list && (bio->bi_end_io != noclone_endio)) {
> +		/*
> +		 * It is fine to use {blk,dm}_queue_split() before opting to use noclone
> +		 * because any ->bi_private and ->bi_end_io will get saved off below.
> +		 * Once noclone_endio() is called the bio_chain's endio will kick in.
> +		 * - __split_and_process_bio() can split further, but any targets that
> +		 *   require late _DM_ initiated splitting (e.g. dm_accept_partial_bio()
> +		 *   callers) shouldn't set ti->no_clone.
> +		 */
>  		if (is_abnormal_io(bio))
>  			blk_queue_split(md->queue, &bio);
>  		else
>  			dm_queue_split(md, ti, &bio);
>  	}
>  
> +	if (dm_table_supports_noclone(map) &&
> +	    (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE) &&
> +	    likely(!(bio->bi_opf & REQ_PREFLUSH)) &&
> +	    likely(!bio_integrity(bio)) && /* integrity requires specialized processing */
> +	    likely(!dm_stats_used(&md->stats))) { /* noclone doesn't support dm-stats */
> +		int r;
> +		struct dm_noclone *noclone;
> +
> +		/* Already been here if bio->bi_end_io is noclone_endio, reentered via dm_wq_work() */
> +		if (bio->bi_end_io != noclone_endio) {
> +			noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
> +			if (unlikely(!noclone))
> +				goto no_fast_path;
> +
> +			noclone->md = md;
> +			noclone->start_time = jiffies;
> +			noclone->orig_bi_end_io = bio->bi_end_io;
> +			noclone->orig_bi_private = bio->bi_private;
> +			bio->bi_end_io = noclone_endio;
> +			bio->bi_private = noclone;
> +		} else
> +			noclone = bio->bi_private;
> +
> +		start_io_acct(md, bio);
> +		r = ti->type->map(ti, bio);
> +		switch (r) {
> +		case DM_MAPIO_SUBMITTED:
> +			break;
> +		case DM_MAPIO_REMAPPED:
> +			/* the bio has been remapped so dispatch it */
> +			if (md->type == DM_TYPE_NVME_BIO_BASED)
> +				ret = direct_make_request(bio);
> +			else
> +				ret = generic_make_request(bio);
> +			break;
> +		case DM_MAPIO_KILL:
> +			bio_io_error(bio);
> +			break;
> +		case DM_MAPIO_REQUEUE:
> +			bio->bi_status = BLK_STS_DM_REQUEUE;
> +			noclone_endio(bio);
> +			break;
> +		default:
> +			DMWARN("unimplemented target map return value: %d", r);
> +			BUG();
> +		}
> +
> +		return ret;
> +	}
> +no_fast_path:
>  	if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
>  		return __process_bio(md, map, bio, ti);
>  	else
> @@ -1798,48 +1859,8 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
>  		return ret;
>  	}
>  
> -	if (dm_table_supports_noclone(map) &&
> -	    (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE) &&
> -	    likely(!(bio->bi_opf & REQ_PREFLUSH)) &&
> -	    !bio_flagged(bio, BIO_CHAIN) &&
> -	    likely(!bio_integrity(bio)) &&
> -	    likely(!dm_stats_used(&md->stats))) {
> -		int r;
> -		struct dm_noclone *noclone;
> -		struct dm_target *ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
> -		if (unlikely(!dm_target_is_valid(ti)))
> -			goto no_fast_path;
> -		if (unlikely(bio_sectors(bio) > max_io_len(bio->bi_iter.bi_sector, ti)))
> -			goto no_fast_path;
> -		noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
> -		if (unlikely(!noclone))
> -			goto no_fast_path;
> -		noclone->md = md;
> -		noclone->start_time = jiffies;
> -		noclone->orig_bi_end_io = bio->bi_end_io;
> -		noclone->orig_bi_private = bio->bi_private;
> -		bio->bi_end_io = noclone_endio;
> -		bio->bi_private = noclone;
> -		start_io_acct(md, bio);
> -		r = ti->type->map(ti, bio);
> -		ret = BLK_QC_T_NONE;
> -		if (likely(r == DM_MAPIO_REMAPPED)) {
> -			ret = generic_make_request(bio);
> -		} else if (likely(r == DM_MAPIO_SUBMITTED)) {
> -		} else if (r == DM_MAPIO_KILL) {
> -			bio->bi_status = BLK_STS_IOERR;
> -			noclone_endio(bio);
> -		} else {
> -			DMWARN("unimplemented target map return value: %d", r);
> -			BUG();
> -		}
> -		goto put_table_ret;
> -	}
> -
> -no_fast_path:
>  	ret = dm_process_bio(md, map, bio);
>  
> -put_table_ret:
>  	dm_put_live_table(md, srcu_idx);
>  	return ret;
>  }
> -- 
> 2.15.0
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread
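
For context on the mechanism being debugged in the replies below: the
noclone fast path avoids cloning by hijacking the bio's completion hooks.
A minimal sketch of the pattern -- the struct fields match the quoted
patch, but the shape of noclone_endio() is an assumption (the real
drivers/md/dm.c version also ends io accounting and handles
BLK_STS_DM_REQUEUE):

#include <linux/bio.h>
#include <linux/slab.h>

struct dm_noclone {
	struct mapped_device *md;
	unsigned long start_time;
	bio_end_io_t *orig_bi_end_io;
	void *orig_bi_private;
};

static void noclone_endio(struct bio *bio)
{
	struct dm_noclone *noclone = bio->bi_private;

	/* restore the caller's completion context... */
	bio->bi_end_io = noclone->orig_bi_end_io;
	bio->bi_private = noclone->orig_bi_private;
	kfree(noclone);

	/*
	 * ...and hand completion to the saved endio: possibly a
	 * bio_chain endio from an earlier split, or a higher DM
	 * layer's own noclone_endio.
	 */
	if (bio->bi_end_io)
		bio->bi_end_io(bio);
}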

* Re: [PATCH v2 5/9] dm: improve noclone bio support
  2019-02-22 10:59   ` Mikulas Patocka
@ 2019-02-22 15:22     ` Mike Snitzer
  2019-02-22 16:56       ` Mike Snitzer
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Snitzer @ 2019-02-22 15:22 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel

On Fri, Feb 22 2019 at  5:59am -0500,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> Hi
> 
> I've found out that this patch causes a timeout in the lvm test
> shell/activate-missing-segment.sh and some others.

OK, I can reproduce with:
make check_local T=shell/activate-missing-segment.sh

Interestingly, it isn't hung in the kernel; the test can be interrupted.
It is hanging at shell/activate-missing-segment.sh's: aux disable_dev "$dev1"

Other tests with "aux disable_dev" hang too
(e.g. shell/lvcreate-repair.sh, shell/vgreduce-usage.sh)

continued below:

> On Wed, 20 Feb 2019, Mike Snitzer wrote:
> 
> > Leverage fact that dm_queue_split() enables use of noclone support even
> > for targets that require additional splitting (via ti->max_io_len).
> > 
> > To achieve this move noclone processing from dm_make_request() to
> > dm_process_bio() and check if bio->bi_end_io is already set to
> > noclone_endio.  This also fixes dm_wq_work() to be able to support
> > noclone bios (because it calls dm_process_bio()).
> > 
> > While at it, clean up ->map() return processing to be comparable to
> > __map_bio() and add some code comments that explain why certain
> > conditionals exist to prevent the use of noclone.
> > 
> > Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> > ---
> >  drivers/md/dm.c | 103 ++++++++++++++++++++++++++++++++++----------------------
> >  1 file changed, 62 insertions(+), 41 deletions(-)
> > 
> > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > index 57919f211acc..d84735be6927 100644
> > --- a/drivers/md/dm.c
> > +++ b/drivers/md/dm.c
> > @@ -1764,14 +1764,75 @@ static blk_qc_t dm_process_bio(struct mapped_device *md,
> >  	 * If in ->make_request_fn we need to use blk_queue_split(), otherwise
> >  	 * queue_limits for abnormal requests (e.g. discard, writesame, etc)
> >  	 * won't be imposed.
> > +	 *
> > +	 * For normal READ and WRITE requests we benefit from upfront splitting
> > +	 * via dm_queue_split() because we can then leverage noclone support below.
> >  	 */
> > -	if (current->bio_list) {
> > +	if (current->bio_list && (bio->bi_end_io != noclone_endio)) {
> > +		/*
> > +		 * It is fine to use {blk,dm}_queue_split() before opting to use noclone
> > +		 * because any ->bi_private and ->bi_end_io will get saved off below.
> > +		 * Once noclone_endio() is called the bio_chain's endio will kick in.
> > +		 * - __split_and_process_bio() can split further, but any targets that
> > +		 *   require late _DM_ initiated splitting (e.g. dm_accept_partial_bio()
> > +		 *   callers) shouldn't set ti->no_clone.
> > +		 */
> >  		if (is_abnormal_io(bio))
> >  			blk_queue_split(md->queue, &bio);
> >  		else
> >  			dm_queue_split(md, ti, &bio);
> >  	}
> >  
> > +	if (dm_table_supports_noclone(map) &&
> > +	    (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE) &&
> > +	    likely(!(bio->bi_opf & REQ_PREFLUSH)) &&
> > +	    likely(!bio_integrity(bio)) && /* integrity requires specialized processing */
> > +	    likely(!dm_stats_used(&md->stats))) { /* noclone doesn't support dm-stats */
> > +		int r;
> > +		struct dm_noclone *noclone;
> > +
> > +		/* Already been here if bio->bi_end_io is noclone_endio, reentered via dm_wq_work() */
> > +		if (bio->bi_end_io != noclone_endio) {

This conditional is causing it ^

Very strange...

> > +			noclone = kmalloc_node(sizeof(*noclone), GFP_NOWAIT, md->numa_node_id);
> > +			if (unlikely(!noclone))
> > +				goto no_fast_path;
> > +
> > +			noclone->md = md;
> > +			noclone->start_time = jiffies;
> > +			noclone->orig_bi_end_io = bio->bi_end_io;
> > +			noclone->orig_bi_private = bio->bi_private;
> > +			bio->bi_end_io = noclone_endio;
> > +			bio->bi_private = noclone;
> > +		} else
> > +			noclone = bio->bi_private;
> > +
> > +		start_io_acct(md, bio);
> > +		r = ti->type->map(ti, bio);
> > +		switch (r) {
> > +		case DM_MAPIO_SUBMITTED:
> > +			break;
> > +		case DM_MAPIO_REMAPPED:
> > +			/* the bio has been remapped so dispatch it */
> > +			if (md->type == DM_TYPE_NVME_BIO_BASED)
> > +				ret = direct_make_request(bio);
> > +			else
> > +				ret = generic_make_request(bio);
> > +			break;
> > +		case DM_MAPIO_KILL:
> > +			bio_io_error(bio);
> > +			break;
> > +		case DM_MAPIO_REQUEUE:
> > +			bio->bi_status = BLK_STS_DM_REQUEUE;
> > +			noclone_endio(bio);
> > +			break;
> > +		default:
> > +			DMWARN("unimplemented target map return value: %d", r);
> > +			BUG();
> > +		}
> > +
> > +		return ret;
> > +	}
> > +no_fast_path:
> >  	if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
> >  		return __process_bio(md, map, bio, ti);
> >  	else

^ permalink raw reply	[flat|nested] 14+ messages in thread
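
A note on the early splitting the quoted commit message leans on:
dm_queue_split() comes from patch 1/9 and is what makes noclone viable for
targets that set ti->max_io_len.  Its assumed shape, sketched with the
stock bio_split()/bio_chain() helpers (illustration only, not the applied
patch):

/*
 * Peel off the largest length the target accepts, chain it to the
 * remainder, requeue the remainder, and let the caller map the front
 * piece.  bio_chain() installs its own ->bi_end_io in the split bio,
 * which is exactly the endio the noclone path later saves and restores
 * -- hence the quoted comment that the chain's endio kicks in once
 * noclone_endio() has run.
 */
static void dm_queue_split(struct mapped_device *md, struct dm_target *ti,
			   struct bio **bio)
{
	unsigned len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti),
			     bio_sectors(*bio));

	if (len < bio_sectors(*bio)) {
		struct bio *split = bio_split(*bio, len, GFP_NOIO,
					      &md->queue->bio_split);

		bio_chain(split, *bio);
		generic_make_request(*bio);	/* remainder goes back around */
		*bio = split;			/* caller maps the front piece */
	}
}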

* Re: [PATCH v2 5/9] dm: improve noclone bio support
  2019-02-22 15:22     ` Mike Snitzer
@ 2019-02-22 16:56       ` Mike Snitzer
  0 siblings, 0 replies; 14+ messages in thread
From: Mike Snitzer @ 2019-02-22 16:56 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel

On Fri, Feb 22 2019 at 10:22am -0500,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Fri, Feb 22 2019 at  5:59am -0500,
> Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > Hi
> > 
> > I've found out that this patch causes a timeout in the lvm test
> > shell/activate-missing-segment.sh and some others.
> 
> OK, I can reproduce with:
> make check_local T=shell/activate-missing-segment.sh
> 
> Interestingly, it isn't hung in the kernel; the test can be interrupted.
> It is hanging at shell/activate-missing-segment.sh's: aux disable_dev "$dev1"
> 
> Other tests with "aux disable_dev" hang too
> (e.g. shell/lvcreate-repair.sh, shell/vgreduce-usage.sh)
> 
> continued below:
> 
> > On Wed, 20 Feb 2019, Mike Snitzer wrote:
> > 
> > > [ full patch re-quote snipped; the hunk in question follows ]
> > > +	if (dm_table_supports_noclone(map) &&
> > > +	    (bio_op(bio) == REQ_OP_READ || bio_op(bio) == REQ_OP_WRITE) &&
> > > +	    likely(!(bio->bi_opf & REQ_PREFLUSH)) &&
> > > +	    likely(!bio_integrity(bio)) && /* integrity requires specialized processing */
> > > +	    likely(!dm_stats_used(&md->stats))) { /* noclone doesn't support dm-stats */
> > > +		int r;
> > > +		struct dm_noclone *noclone;
> > > +
> > > +		/* Already been here if bio->bi_end_io is noclone_endio, reentered via dm_wq_work() */
> > > +		if (bio->bi_end_io != noclone_endio) {
> 
> This conditional is causing it ^
> 
> Very strange...

Added some debugging:

Feb 22 10:46:28 thegoat kernel: device-mapper: core: dm_process_bio: 253:2: bio=00000000c70f3c09 bio->bi_iter.bi_sector=0 len=256 bio->bi_end_io != noclone_endio
Feb 22 10:46:28 thegoat kernel: device-mapper: core: dm_process_bio: 253:0: bio=00000000c70f3c09 bio->bi_iter.bi_sector=2048 len=256 bio->bi_end_io = noclone_endio
...
Feb 22 10:46:28 thegoat kernel: device-mapper: core: dm_process_bio: 253:2: bio=00000000da9cadf3 bio->bi_iter.bi_sector=0 len=256 bio->bi_end_io != noclone_endio
Feb 22 10:46:28 thegoat kernel: device-mapper: core: dm_process_bio: 253:0: bio=00000000da9cadf3 bio->bi_iter.bi_sector=2048 len=256 bio->bi_end_io = noclone_endio

So the same noclone bio is entering each device in the DM stack in turn
(perfectly normal).
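
(Presumably this is also why the tests hang rather than crash: with the
reuse branch above, start_io_acct() runs once for every device in the
stack the bio enters, but the single shared noclone only records the
first device's md, so noclone_endio() can only ever decrement that one
in-flight counter.  The lower device's count never drains, and whatever
later waits for that device to go idle never makes progress.)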

This fixes the issue:

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a9856f1986df..8fbe4d5ec8e4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1831,15 +1831,16 @@ static blk_qc_t dm_process_bio(struct mapped_device *md,
            likely(!bio_integrity(bio)) && /* integrity requires specialized processing */
            likely(!dm_stats_used(&md->stats))) { /* noclone doesn't support dm-stats */
                int r;
-               struct dm_noclone *noclone;
-
-               /* Already been here if bio->bi_end_io is noclone_endio, reentered via dm_wq_work() */
-               if (bio->bi_end_io != noclone_endio) {
+               /*
+                * Only allocate noclone if in ->make_request_fn, otherwise
+                * leak could occur due to reentering (e.g. from dm_wq_work)
+                */
+               if (current->bio_list) {
+                       struct dm_noclone *noclone;
                        noclone = kmalloc_node(sizeof(*noclone) + ti->per_io_data_size + sizeof(unsigned),
                                               GFP_NOWAIT, md->numa_node_id);
                        if (unlikely(!noclone))
                                goto no_fast_path;
-
                        noclone->md = md;
                        noclone->start_time = jiffies;
                        noclone->orig_bi_end_io = bio->bi_end_io;

That said, I've reasoned through other edge cases that need to be dealt
with and will layer other fixes in addition to this one.
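
Condensed, the fixed entry logic looks like this (sketch only; it assumes
the existing else-branch reuse survives the fix, which the trimmed diff
context suggests):

		struct dm_noclone *noclone;

		/*
		 * current->bio_list is non-NULL only inside
		 * generic_make_request(), so every pass through a
		 * ->make_request_fn -- including each lower layer of a DM
		 * stack -- allocates its own noclone and saves the previous
		 * completion context (possibly an upper layer's
		 * noclone_endio), letting the endios nest and unwind
		 * correctly.  Only the dm_wq_work() re-entry, where the bio
		 * was already set up before being deferred, skips allocation.
		 */
		if (current->bio_list) {
			noclone = kmalloc_node(sizeof(*noclone) + ti->per_io_data_size + sizeof(unsigned),
					       GFP_NOWAIT, md->numa_node_id);
			if (unlikely(!noclone))
				goto no_fast_path;
			noclone->md = md;
			noclone->start_time = jiffies;
			noclone->orig_bi_end_io = bio->bi_end_io;
			noclone->orig_bi_private = bio->bi_private;
			bio->bi_end_io = noclone_endio;
			bio->bi_private = noclone;
		} else {
			noclone = bio->bi_private;	/* set up before deferral */
		}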

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread

Thread overview: 14+ messages
2019-02-20 21:44 [PATCH v2 0/9] dm: changes staged in linux-next for 5.1 so far Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 1/9] dm: update dm_process_bio() to split bio if in ->make_request_fn() Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 2/9] dm: eliminate 'split_discard_bios' flag from DM target interface Mike Snitzer
2019-02-21  4:36   ` Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 3/9] dm: refactor start_io_acct and end_io_acct Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 4/9] dm: implement noclone optimization for bio-based Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 5/9] dm: improve noclone bio support Mike Snitzer
2019-02-22 10:59   ` Mikulas Patocka
2019-02-22 15:22     ` Mike Snitzer
2019-02-22 16:56       ` Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 6/9] dm: add per-bio-data support to noclone bio Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 7/9] dm: improve noclone_endio() to support multipath target Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 8/9] dm mpath: enable noclone support for bio-based Mike Snitzer
2019-02-20 21:44 ` [PATCH v2 9/9] dm: remove unused _rq_tio_cache and _rq_cache Mike Snitzer
