Linux-SCSI Archive mirror
 help / color / mirror / Atom feed
* [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-10-30  3:30 [RFC] Thin provisioning bits Martin K. Petersen
@ 2009-10-30  3:30 ` Martin K. Petersen
  2009-10-30  4:28   ` Douglas Gilbert
  2009-10-30  5:11   ` Christoph Hellwig
  0 siblings, 2 replies; 18+ messages in thread
From: Martin K. Petersen @ 2009-10-30  3:30 UTC (permalink / raw
  To: hch, axboe, matthew, linux-scsi; +Cc: Martin K. Petersen

Implement a prepare discard function that sends either WRITE SAME(16) or
UNMAP(10) depending on parameters indicated by the device in the block
limits VPD.

Extract unmap constraints and report them to the block layer.

Based in part on a patch by Christoph Hellwig <hch@lst.de>.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/sd.c |   93 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/sd.h |    2 +
 2 files changed, 95 insertions(+), 0 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 9093c72..72e0d78 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -264,6 +264,15 @@ sd_show_app_tag_own(struct device *dev, struct device_attribute *attr,
 	return snprintf(buf, 20, "%u\n", sdkp->ATO);
 }
 
+static ssize_t
+sd_show_thin_provisioning(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+	return snprintf(buf, 20, "%u\n", sdkp->thin_provisioning);
+}
+
 static struct device_attribute sd_disk_attrs[] = {
 	__ATTR(cache_type, S_IRUGO|S_IWUSR, sd_show_cache_type,
 	       sd_store_cache_type),
@@ -274,6 +283,7 @@ static struct device_attribute sd_disk_attrs[] = {
 	       sd_store_manage_start_stop),
 	__ATTR(protection_type, S_IRUGO, sd_show_protection_type, NULL),
 	__ATTR(app_tag_own, S_IRUGO, sd_show_app_tag_own, NULL),
+	__ATTR(thin_provisioning, S_IRUGO, sd_show_thin_provisioning, NULL),
 	__ATTR_NULL,
 };
 
@@ -399,6 +409,57 @@ static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif)
 }
 
 /**
+ * sd_prepare_discard - unmap blocks on thinly provisioned device
+ * @rq: Request to prepare
+ *
+ * Will issue either UNMAP or WRITE SAME(16) depending on preference
+ * indicated by target device.
+ **/
+static int sd_prepare_discard(struct request *rq)
+{
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	struct bio *bio = rq->bio;
+	sector_t sector = bio->bi_sector;
+	unsigned int num = bio_sectors(bio);
+
+	if (sdkp->device->sector_size == 4096) {
+		sector >>= 3;
+		num >>= 3;
+	}
+
+	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+	rq->timeout = SD_TIMEOUT;
+
+	memset(rq->cmd, 0, rq->cmd_len);
+
+	if (sdkp->unmap) {
+		char *buf = kmap_atomic(bio_page(bio), KM_USER0);
+
+		rq->cmd[0] = UNMAP;
+		rq->cmd[8] = 24;
+		rq->cmd_len = 10;
+
+		/* Ensure that data length matches payload */
+		rq->__data_len = bio->bi_size = bio->bi_io_vec->bv_len = 24;
+
+		put_unaligned_be16(6 + 16, &buf[0]);
+		put_unaligned_be16(16, &buf[2]);
+		put_unaligned_be64(sector, &buf[8]);
+		put_unaligned_be32(num, &buf[16]);
+
+		kunmap_atomic(buf, KM_USER0);
+	} else {
+		rq->cmd[0] = WRITE_SAME_16;
+		rq->cmd[1] = 0x8; /* UNMAP */
+		put_unaligned_be64(sector, &rq->cmd[2]);
+		put_unaligned_be32(num, &rq->cmd[10]);
+		rq->cmd_len = 16;
+	}
+
+	return BLKPREP_OK;
+}
+
+/**
  *	sd_init_command - build a scsi (read or write) command from
  *	information in the request structure.
  *	@SCpnt: pointer to mid-level's per scsi command structure that
@@ -418,6 +479,13 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	int ret, host_dif;
 	unsigned char protect;
 
+	/*
+	 * Discard request come in as REQ_TYPE_FS but we turn them into
+	 * block PC requests to make life easier.
+	 */
+	if (blk_discard_rq(rq))
+		ret = sd_prepare_discard(rq);
+
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		ret = scsi_setup_blk_pc_cmnd(sdp, rq);
 		goto out;
@@ -1432,6 +1500,9 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
 		sd_printk(KERN_NOTICE, sdkp,
 			  "physical block alignment offset: %u\n", alignment);
 
+	if (buffer[14] & 0x80)
+		sdkp->thin_provisioning = 1;
+
 	sdkp->capacity = lba + 1;
 	return sector_size;
 }
@@ -1863,6 +1934,7 @@ void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer)
  */
 static void sd_read_block_limits(struct scsi_disk *sdkp)
 {
+	struct scsi_device *sdp = sdkp->device;
 	unsigned int sector_sz = sdkp->device->sector_size;
 	char *buffer;
 
@@ -1877,6 +1949,27 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
 	blk_queue_io_opt(sdkp->disk->queue,
 			 get_unaligned_be32(&buffer[12]) * sector_sz);
 
+	if (sdkp->thin_provisioning && buffer[3] == 0x3c) {
+		unsigned int lba_count, desc_count, granularity;
+		unsigned int max_sectors = UINT_MAX;
+
+		lba_count = get_unaligned_be32(&buffer[20]);
+		desc_count = get_unaligned_be32(&buffer[24]);
+		granularity = get_unaligned_be32(&buffer[28]) * sector_sz;
+
+		if (lba_count && desc_count) {
+			sdkp->unmap = 1;
+			max_sectors = lba_count * sector_sz >> 9;
+		}
+
+		if (granularity == 0)
+			granularity = sector_sz;
+
+		blk_queue_max_discard_sectors(sdp->request_queue, max_sectors);
+		blk_queue_discard_granularity(sdp->request_queue, granularity);
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, sdkp->disk->queue);
+	}
+
 	kfree(buffer);
 }
 
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index e374804..43d3caf 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -60,6 +60,8 @@ struct scsi_disk {
 	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
 	unsigned	DPOFUA : 1;	/* state of disk DPOFUA bit */
 	unsigned	first_scan : 1;
+	unsigned	thin_provisioning : 1;
+	unsigned	unmap : 1;
 };
 #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev)
 
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-10-30  3:30 ` [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support Martin K. Petersen
@ 2009-10-30  4:28   ` Douglas Gilbert
  2009-10-30  4:53     ` Martin K. Petersen
  2009-10-30  5:11   ` Christoph Hellwig
  1 sibling, 1 reply; 18+ messages in thread
From: Douglas Gilbert @ 2009-10-30  4:28 UTC (permalink / raw
  To: Martin K. Petersen; +Cc: hch, axboe, matthew, linux-scsi

Martin K. Petersen wrote:
> Implement a prepare discard function that sends either WRITE SAME(16) or
> UNMAP(10) depending on parameters indicated by the device in the block
> limits VPD.

And if both are supported by the logical unit, the patch
prefers UNMAP?

Doug Gilbert

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-10-30  4:28   ` Douglas Gilbert
@ 2009-10-30  4:53     ` Martin K. Petersen
  2009-10-30 16:02       ` Ric Wheeler
  0 siblings, 1 reply; 18+ messages in thread
From: Martin K. Petersen @ 2009-10-30  4:53 UTC (permalink / raw
  To: dgilbert; +Cc: Martin K. Petersen, hch, axboe, matthew, linux-scsi

>>>>> "Doug" == Douglas Gilbert <dgilbert@interlog.com> writes:

Doug> And if both are supported by the logical unit, the patch prefers
Doug> UNMAP?

Yes.

SBC states that if the device reports MAXIMUM UNMAP LBA COUNT > 1 and
MAXIMUM UNMAP DESCRIPTOR COUNT > 1 then the device supports UNMAP.  And
in that case that's what I'll issue.  In all other cases I'll send out
WRITE SAME(16).  I believe that approach is what's currently considered
best practice.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-10-30  3:30 ` [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support Martin K. Petersen
  2009-10-30  4:28   ` Douglas Gilbert
@ 2009-10-30  5:11   ` Christoph Hellwig
  2009-11-02 13:32     ` Martin K. Petersen
  1 sibling, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-10-30  5:11 UTC (permalink / raw
  To: Martin K. Petersen; +Cc: hch, axboe, matthew, linux-scsi

On Thu, Oct 29, 2009 at 11:30:09PM -0400, Martin K. Petersen wrote:
> +sd_show_thin_provisioning(struct device *dev, struct device_attribute *attr,
> +			  char *buf)
> +{
> +	struct scsi_disk *sdkp = to_scsi_disk(dev);
> +
> +	return snprintf(buf, 20, "%u\n", sdkp->thin_provisioning);
> +}

If we want to expose this we'd better do it at the block layer so that
it also covers non-scsi discard support.  Besides the mtd one already in
tree I also plan to add a discard command to virtio-blk.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-10-30  4:53     ` Martin K. Petersen
@ 2009-10-30 16:02       ` Ric Wheeler
  0 siblings, 0 replies; 18+ messages in thread
From: Ric Wheeler @ 2009-10-30 16:02 UTC (permalink / raw
  To: Martin K. Petersen
  Cc: dgilbert, hch, axboe, matthew, linux-scsi, linux-fsdevel

On 10/30/2009 12:53 AM, Martin K. Petersen wrote:
>>>>>> "Doug" == Douglas Gilbert<dgilbert@interlog.com>  writes:
>>>>>>              
> Doug>  And if both are supported by the logical unit, the patch prefers
> Doug>  UNMAP?
>
> Yes.
>
> SBC states that if the device reports MAXIMUM UNMAP LBA COUNT>  1 and
> MAXIMUM UNMAP DESCRIPTOR COUNT>  1 then the device supports UNMAP.  And
> in that case that's what I'll issue.  In all other cases I'll send out
> WRITE SAME(16).  I believe that approach is what's currently considered
> best practice.
>    

This sounds like the correct thing to do.

We should at the same time try to unify the file system mount options so 
we roll out the testing in a careful way.

Specifically, I would suggest that we default to "not" issuing discards 
by default and that we try to use the same mount option for any file 
system that supports barrier discards. My worry is that we will fry 
SSD's (like the reported issues with the Intel SSD's and Windows 7) or 
have horrific performance on arrays that are not tuned for fine grained 
discards :-)

Ric


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-10-30  5:11   ` Christoph Hellwig
@ 2009-11-02 13:32     ` Martin K. Petersen
  2009-11-03 15:12       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Martin K. Petersen @ 2009-11-02 13:32 UTC (permalink / raw
  To: Christoph Hellwig; +Cc: Martin K. Petersen, axboe, matthew, linux-scsi

>>>>> "Christoph" == Christoph Hellwig <hch@infradead.org> writes:

Christoph> If we want to expose this we'd better do it at the block
Christoph> layer

I agree.  I've already had several requests for "is this blkdev thinly
provisioned?"...

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-11-02 13:32     ` Martin K. Petersen
@ 2009-11-03 15:12       ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2009-11-03 15:12 UTC (permalink / raw
  To: Martin K. Petersen; +Cc: Christoph Hellwig, axboe, matthew, linux-scsi

On Mon, Nov 02, 2009 at 08:32:08AM -0500, Martin K. Petersen wrote:
> >>>>> "Christoph" == Christoph Hellwig <hch@infradead.org> writes:
> 
> Christoph> If we want to expose this we'd better do it at the block
> Christoph> layer
> 
> I agree.  I've already had several requests for "is this blkdev thinly
> provisioned?"...

Btw, I wonder if we should make a distinction between truly thin
provisioned and just discard capable devices like SSDs or other FTLs.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Thin provisioning update
@ 2009-11-04  4:25 Martin K. Petersen
  2009-11-04  4:25 ` [PATCH 1/2] block: Expose discard granularity Martin K. Petersen
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Martin K. Petersen @ 2009-11-04  4:25 UTC (permalink / raw
  To: hch, axboe, matthew, linux-scsi

Christoph,

Here are some more recent bits for you to tinker with.

The first patch exports unmap granularity and alignment up the stack.  I have
also implemented support for topology stacking of these parameters.  A reported
discard granularity of 0 means fully provisioned.

In the second patch I made a few changes to the TP detection and extraction of
VPD values.

I have been hunting for heuristics for the WRITE SAME case but haven't found any
good ones.  I'm thinking we probably want to distinguish between a real array
with NV-backed unmap queueing and a thinly provisioned disk on a virtualization
server where there might be a real impact from having to zero out partial blocks.
I guess that's easy to handle with virtio but I'm not so sure how to do it when
emulating a SCSI disk.



^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/2] block: Expose discard granularity
  2009-11-04  4:25 Thin provisioning update Martin K. Petersen
@ 2009-11-04  4:25 ` Martin K. Petersen
  2009-11-04  4:25 ` [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support Martin K. Petersen
  2009-11-09 14:20 ` Thin provisioning update Christoph Hellwig
  2 siblings, 0 replies; 18+ messages in thread
From: Martin K. Petersen @ 2009-11-04  4:25 UTC (permalink / raw
  To: hch, axboe, matthew, linux-scsi; +Cc: Martin K. Petersen

While SSDs track block usage on a per-sector basis, RAID arrays often
have allocation blocks that are bigger.  Allow the discard granularity
and alignment to be set and teach the topology stacking logic how to
handle them.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-settings.c   |   46 ++++++++++++++++++++++++++++++++++++----------
 block/blk-sysfs.c      |   22 ++++++++++++++++++++++
 block/genhd.c          |   12 ++++++++++++
 fs/partitions/check.c  |   12 ++++++++++++
 include/linux/blkdev.h |   18 ++++++++++++++++++
 include/linux/genhd.h  |    1 +
 6 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66d4aa8..7f986ca 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,7 +96,10 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_segment_size = MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
 	lim->max_hw_sectors = INT_MAX;
-	lim->max_discard_sectors = SAFE_MAX_SECTORS;
+	lim->max_discard_sectors = 0;
+	lim->discard_granularity = 0;
+	lim->discard_alignment = 0;
+	lim->discard_misaligned = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -488,6 +491,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
+static unsigned int lcm(unsigned int a, unsigned int b)
+{
+	if (a && b)
+		return (a * b) / gcd(a, b);
+	else if (b)
+		return b;
+
+	return a;
+}
+
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top)
@@ -502,6 +515,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		     sector_t offset)
 {
+	int ret;
+
+	ret = 0;
+
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -531,7 +548,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	if (offset &&
 	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
+	}
+
+	if (offset &&
+	    (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
+		t->discard_misaligned = 1;
+		ret = -1;
 	}
 
 	/* If top has no alignment offset, inherit from bottom */
@@ -539,23 +562,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		t->alignment_offset =
 			b->alignment_offset & (b->physical_block_size - 1);
 
+	if (!t->discard_alignment)
+		t->discard_alignment =
+			b->discard_alignment & (b->discard_granularity - 1);
+
 	/* Top device aligned on logical block boundary? */
 	if (t->alignment_offset & (t->logical_block_size - 1)) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
 	}
 
-	/* Find lcm() of optimal I/O size */
-	if (t->io_opt && b->io_opt)
-		t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt);
-	else if (b->io_opt)
-		t->io_opt = b->io_opt;
+	/* Find lcm() of optimal I/O size and granularity */
+	t->io_opt = lcm(t->io_opt, b->io_opt);
+	t->discard_granularity = lcm(t->discard_granularity,
+				     b->discard_granularity);
 
 	/* Verify that optimal I/O size is a multiple of io_min */
 	if (t->io_min && t->io_opt % t->io_min)
-		return -1;
+		ret = -1;
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8a6d81a..3147145 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -126,6 +126,16 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
 	return queue_var_show(queue_io_opt(q), page);
 }
 
+static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.discard_granularity, page);
+}
+
+static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.max_discard_sectors << 9, page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -293,6 +303,16 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
 	.show = queue_io_opt_show,
 };
 
+static struct queue_sysfs_entry queue_discard_granularity_entry = {
+	.attr = {.name = "discard_granularity", .mode = S_IRUGO },
+	.show = queue_discard_granularity_show,
+};
+
+static struct queue_sysfs_entry queue_discard_max_entry = {
+	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
+	.show = queue_discard_max_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -328,6 +348,8 @@ static struct attribute *default_attrs[] = {
 	&queue_physical_block_size_entry.attr,
 	&queue_io_min_entry.attr,
 	&queue_io_opt_entry.attr,
+	&queue_discard_granularity_entry.attr,
+	&queue_discard_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/genhd.c b/block/genhd.c
index 517e433..b11a4ad 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 }
 
+static ssize_t disk_discard_alignment_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e1..64bc899 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
 }
 
+ssize_t part_discard_alignment_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%u\n", p->discard_alignment);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	p->start_sect = start;
 	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
+	p->discard_alignment = queue_sector_discard_alignment(disk->queue,
+							      start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 221cecd..3b67221 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -312,12 +312,15 @@ struct queue_limits {
 	unsigned int		io_min;
 	unsigned int		io_opt;
 	unsigned int		max_discard_sectors;
+	unsigned int		discard_granularity;
+	unsigned int		discard_alignment;
 
 	unsigned short		logical_block_size;
 	unsigned short		max_hw_segments;
 	unsigned short		max_phys_segments;
 
 	unsigned char		misaligned;
+	unsigned char		discard_misaligned;
 	unsigned char		no_cluster;
 };
 
@@ -1134,6 +1137,21 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
 	return q->limits.alignment_offset;
 }
 
+static inline int queue_discard_alignment(struct request_queue *q)
+{
+	if (q->limits.discard_misaligned)
+		return -1;
+
+	return q->limits.discard_alignment;
+}
+
+static inline int queue_sector_discard_alignment(struct request_queue *q,
+						 sector_t sector)
+{
+	return ((sector << 9) - q->limits.discard_alignment)
+		& (q->limits.discard_granularity - 1);
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 297df45..c6c0c41 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -91,6 +91,7 @@ struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
 	sector_t alignment_offset;
+	unsigned int discard_alignment;
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support
  2009-11-04  4:25 Thin provisioning update Martin K. Petersen
  2009-11-04  4:25 ` [PATCH 1/2] block: Expose discard granularity Martin K. Petersen
@ 2009-11-04  4:25 ` Martin K. Petersen
  2009-11-09 14:20 ` Thin provisioning update Christoph Hellwig
  2 siblings, 0 replies; 18+ messages in thread
From: Martin K. Petersen @ 2009-11-04  4:25 UTC (permalink / raw
  To: hch, axboe, matthew, linux-scsi; +Cc: Martin K. Petersen

Implement a prepare discard function that sends either WRITE SAME(16) or
UNMAP(10) depending on parameters indicated by the device in the block
limits VPD.

Extract unmap constraints and report them to the block layer.

Based in part on a patch by Christoph Hellwig <hch@lst.de>.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/sd.c |  105 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/sd.h |    2 +
 2 files changed, 107 insertions(+), 0 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 9093c72..80a5cb5 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -264,6 +264,15 @@ sd_show_app_tag_own(struct device *dev, struct device_attribute *attr,
 	return snprintf(buf, 20, "%u\n", sdkp->ATO);
 }
 
+static ssize_t
+sd_show_thin_provisioning(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+	return snprintf(buf, 20, "%u\n", sdkp->thin_provisioning);
+}
+
 static struct device_attribute sd_disk_attrs[] = {
 	__ATTR(cache_type, S_IRUGO|S_IWUSR, sd_show_cache_type,
 	       sd_store_cache_type),
@@ -274,6 +283,7 @@ static struct device_attribute sd_disk_attrs[] = {
 	       sd_store_manage_start_stop),
 	__ATTR(protection_type, S_IRUGO, sd_show_protection_type, NULL),
 	__ATTR(app_tag_own, S_IRUGO, sd_show_app_tag_own, NULL),
+	__ATTR(thin_provisioning, S_IRUGO, sd_show_thin_provisioning, NULL),
 	__ATTR_NULL,
 };
 
@@ -399,6 +409,57 @@ static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif)
 }
 
 /**
+ * sd_prepare_discard - unmap blocks on thinly provisioned device
+ * @rq: Request to prepare
+ *
+ * Will issue either UNMAP or WRITE SAME(16) depending on preference
+ * indicated by target device.
+ **/
+static int sd_prepare_discard(struct request *rq)
+{
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	struct bio *bio = rq->bio;
+	sector_t sector = bio->bi_sector;
+	unsigned int num = bio_sectors(bio);
+
+	if (sdkp->device->sector_size == 4096) {
+		sector >>= 3;
+		num >>= 3;
+	}
+
+	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+	rq->timeout = SD_TIMEOUT;
+
+	memset(rq->cmd, 0, rq->cmd_len);
+
+	if (sdkp->unmap) {
+		char *buf = kmap_atomic(bio_page(bio), KM_USER0);
+
+		rq->cmd[0] = UNMAP;
+		rq->cmd[8] = 24;
+		rq->cmd_len = 10;
+
+		/* Ensure that data length matches payload */
+		rq->__data_len = bio->bi_size = bio->bi_io_vec->bv_len = 24;
+
+		put_unaligned_be16(6 + 16, &buf[0]);
+		put_unaligned_be16(16, &buf[2]);
+		put_unaligned_be64(sector, &buf[8]);
+		put_unaligned_be32(num, &buf[16]);
+
+		kunmap_atomic(buf, KM_USER0);
+	} else {
+		rq->cmd[0] = WRITE_SAME_16;
+		rq->cmd[1] = 0x8; /* UNMAP */
+		put_unaligned_be64(sector, &rq->cmd[2]);
+		put_unaligned_be32(num, &rq->cmd[10]);
+		rq->cmd_len = 16;
+	}
+
+	return BLKPREP_OK;
+}
+
+/**
  *	sd_init_command - build a scsi (read or write) command from
  *	information in the request structure.
  *	@SCpnt: pointer to mid-level's per scsi command structure that
@@ -418,6 +479,13 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	int ret, host_dif;
 	unsigned char protect;
 
+	/*
+	 * Discard request come in as REQ_TYPE_FS but we turn them into
+	 * block PC requests to make life easier.
+	 */
+	if (blk_discard_rq(rq))
+		ret = sd_prepare_discard(rq);
+
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		ret = scsi_setup_blk_pc_cmnd(sdp, rq);
 		goto out;
@@ -1432,6 +1500,9 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
 		sd_printk(KERN_NOTICE, sdkp,
 			  "physical block alignment offset: %u\n", alignment);
 
+	if (buffer[14] & 0x80)
+		sdkp->thin_provisioning = 1;
+
 	sdkp->capacity = lba + 1;
 	return sector_size;
 }
@@ -1863,6 +1934,7 @@ void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer)
  */
 static void sd_read_block_limits(struct scsi_disk *sdkp)
 {
+	struct request_queue *q = sdkp->disk->queue;
 	unsigned int sector_sz = sdkp->device->sector_size;
 	char *buffer;
 
@@ -1877,6 +1949,39 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
 	blk_queue_io_opt(sdkp->disk->queue,
 			 get_unaligned_be32(&buffer[12]) * sector_sz);
 
+	if (sdkp->thin_provisioning) {
+		unsigned int max_sectors = 0xffffffff;
+		unsigned int granularity = 1;
+		unsigned int alignment = 0;
+
+		if (buffer[3] == 0x3c) { /* TP-enhanced BL page length */
+			unsigned int lba_count, desc_count;
+
+			lba_count = get_unaligned_be32(&buffer[20]);
+			desc_count = get_unaligned_be32(&buffer[24]);
+
+			if (lba_count && desc_count) {
+				sdkp->unmap = 1;
+				max_sectors = lba_count * sector_sz >> 9;
+			}
+
+			granularity = get_unaligned_be32(&buffer[28]);
+
+			if (granularity == 0)
+				granularity = 1;
+
+			if (buffer[32] & 0x80)
+				alignment = get_unaligned_be32(&buffer[32])
+					& ~(1 << 31);
+		}
+
+		blk_queue_max_discard_sectors(q, max_sectors);
+		q->limits.discard_granularity = granularity * sector_sz;
+		q->limits.discard_alignment = alignment * sector_sz;
+
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	}
+
 	kfree(buffer);
 }
 
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index e374804..43d3caf 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -60,6 +60,8 @@ struct scsi_disk {
 	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
 	unsigned	DPOFUA : 1;	/* state of disk DPOFUA bit */
 	unsigned	first_scan : 1;
+	unsigned	thin_provisioning : 1;
+	unsigned	unmap : 1;
 };
 #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev)
 
-- 
1.6.0.6


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-04  4:25 Thin provisioning update Martin K. Petersen
  2009-11-04  4:25 ` [PATCH 1/2] block: Expose discard granularity Martin K. Petersen
  2009-11-04  4:25 ` [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support Martin K. Petersen
@ 2009-11-09 14:20 ` Christoph Hellwig
  2009-11-09 19:34   ` Christoph Hellwig
  2 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-11-09 14:20 UTC (permalink / raw
  To: Martin K. Petersen; +Cc: hch, axboe, matthew, linux-scsi

On Tue, Nov 03, 2009 at 11:25:38PM -0500, Martin K. Petersen wrote:
> Christoph,
> 
> Here are some more recent bits for you to tinker with.
> 
> The first patch exports unmap granularity and alignment up the stack.  I have
> also implemented support for topology stacking of these parameters.  A reported
> discard granularity of 0 means fully provisioned.
> 
> In the second patch I made a few changes to the TP detection and extraction of
> VPD values.

I was offline for a while, but I have another half-finished alternative
to it.  Which only exposes the discard granularity/alignment at the
whole disk level and does the adjustment in sd.  I think I actually
prefer it - I'll post it for review as soon as I have caught up with my
backlog.

> I have been hunting for heuristics for the WRITE SAME case but haven't found any
> good ones.  I'm thinking we probably want to distinguish between a real array
> with NV-backed unmap queueing and a thinly provisioned disk on a virtualization
> server where there might be a real impact from having to zero out partial blocks.
> I guess that's easy to handle with virtio but I'm not so sure how to do it when
> emulating a SCSI disk.

We might just offer UNMAP for virtualized disks for now to sort this
out.  But the primary use case at least for now will be ide TRIM for
virtualized environments with virtio as a second and scsi a distant
third for now.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-09 14:20 ` Thin provisioning update Christoph Hellwig
@ 2009-11-09 19:34   ` Christoph Hellwig
  2009-11-10  5:37     ` Martin K. Petersen
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-11-09 19:34 UTC (permalink / raw
  To: Martin K. Petersen; +Cc: hch, axboe, matthew, linux-scsi

So actually I'll take my previous comment back.  I think these patches
are good enough to put them in now to have something we can build on.
We might revise it later to move the adjustments of the trim ranges into
sd, but let's make sure we have something in for the merge window.

These patches work fine with my qemu and scsi_debug testing, and later
this week when I'm home I'll also try it with the OCZ SSD.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-09 19:34   ` Christoph Hellwig
@ 2009-11-10  5:37     ` Martin K. Petersen
  2009-11-10 10:51       ` Jens Axboe
  2009-11-10 13:16       ` Christoph Hellwig
  0 siblings, 2 replies; 18+ messages in thread
From: Martin K. Petersen @ 2009-11-10  5:37 UTC (permalink / raw
  To: axboe; +Cc: Christoph Hellwig, matthew, James.Bottomley, linux-scsi

>>>>> "Christoph" == Christoph Hellwig <hch@infradead.org> writes:

Christoph> So actually I'll take my previous comment back.  I think
Christoph> these patches are good enough to put them in now to have
Christoph> something we can build on.

Ok.  Jens, please queue the patch below.  And then we'll have to see
about getting James to rebase and/or postpone the SCSI portion to round
#2.


block: Expose discard granularity

While SSDs track block usage on a per-sector basis, RAID arrays often
have allocation blocks that are bigger.  Allow the discard granularity
and alignment to be set and teach the topology stacking logic how to
handle them.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66d4aa8..7f986ca 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,7 +96,10 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_segment_size = MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
 	lim->max_hw_sectors = INT_MAX;
-	lim->max_discard_sectors = SAFE_MAX_SECTORS;
+	lim->max_discard_sectors = 0;
+	lim->discard_granularity = 0;
+	lim->discard_alignment = 0;
+	lim->discard_misaligned = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -488,6 +491,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
+static unsigned int lcm(unsigned int a, unsigned int b)
+{
+	if (a && b)
+		return (a * b) / gcd(a, b);
+	else if (b)
+		return b;
+
+	return a;
+}
+
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top)
@@ -502,6 +515,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		     sector_t offset)
 {
+	int ret;
+
+	ret = 0;
+
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -531,7 +548,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	if (offset &&
 	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
+	}
+
+	if (offset &&
+	    (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
+		t->discard_misaligned = 1;
+		ret = -1;
 	}
 
 	/* If top has no alignment offset, inherit from bottom */
@@ -539,23 +562,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		t->alignment_offset =
 			b->alignment_offset & (b->physical_block_size - 1);
 
+	if (!t->discard_alignment)
+		t->discard_alignment =
+			b->discard_alignment & (b->discard_granularity - 1);
+
 	/* Top device aligned on logical block boundary? */
 	if (t->alignment_offset & (t->logical_block_size - 1)) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
 	}
 
-	/* Find lcm() of optimal I/O size */
-	if (t->io_opt && b->io_opt)
-		t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt);
-	else if (b->io_opt)
-		t->io_opt = b->io_opt;
+	/* Find lcm() of optimal I/O size and granularity */
+	t->io_opt = lcm(t->io_opt, b->io_opt);
+	t->discard_granularity = lcm(t->discard_granularity,
+				     b->discard_granularity);
 
 	/* Verify that optimal I/O size is a multiple of io_min */
 	if (t->io_min && t->io_opt % t->io_min)
-		return -1;
+		ret = -1;
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8a6d81a..3147145 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -126,6 +126,16 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
 	return queue_var_show(queue_io_opt(q), page);
 }
 
+static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.discard_granularity, page);
+}
+
+static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.max_discard_sectors << 9, page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -293,6 +303,16 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
 	.show = queue_io_opt_show,
 };
 
+static struct queue_sysfs_entry queue_discard_granularity_entry = {
+	.attr = {.name = "discard_granularity", .mode = S_IRUGO },
+	.show = queue_discard_granularity_show,
+};
+
+static struct queue_sysfs_entry queue_discard_max_entry = {
+	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
+	.show = queue_discard_max_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -328,6 +348,8 @@ static struct attribute *default_attrs[] = {
 	&queue_physical_block_size_entry.attr,
 	&queue_io_min_entry.attr,
 	&queue_io_opt_entry.attr,
+	&queue_discard_granularity_entry.attr,
+	&queue_discard_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/genhd.c b/block/genhd.c
index 517e433..b11a4ad 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 }
 
+static ssize_t disk_discard_alignment_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e1..64bc899 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
 }
 
+ssize_t part_discard_alignment_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%u\n", p->discard_alignment);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	p->start_sect = start;
 	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
+	p->discard_alignment = queue_sector_discard_alignment(disk->queue,
+							      start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 221cecd..3b67221 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -312,12 +312,15 @@ struct queue_limits {
 	unsigned int		io_min;
 	unsigned int		io_opt;
 	unsigned int		max_discard_sectors;
+	unsigned int		discard_granularity;
+	unsigned int		discard_alignment;
 
 	unsigned short		logical_block_size;
 	unsigned short		max_hw_segments;
 	unsigned short		max_phys_segments;
 
 	unsigned char		misaligned;
+	unsigned char		discard_misaligned;
 	unsigned char		no_cluster;
 };
 
@@ -1134,6 +1137,21 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
 	return q->limits.alignment_offset;
 }
 
+static inline int queue_discard_alignment(struct request_queue *q)
+{
+	if (q->limits.discard_misaligned)
+		return -1;
+
+	return q->limits.discard_alignment;
+}
+
+static inline int queue_sector_discard_alignment(struct request_queue *q,
+						 sector_t sector)
+{
+	return ((sector << 9) - q->limits.discard_alignment)
+		& (q->limits.discard_granularity - 1);
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 297df45..c6c0c41 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -91,6 +91,7 @@ struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
 	sector_t alignment_offset;
+	unsigned int discard_alignment;
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-10  5:37     ` Martin K. Petersen
@ 2009-11-10 10:51       ` Jens Axboe
  2009-11-10 13:16       ` Christoph Hellwig
  1 sibling, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2009-11-10 10:51 UTC (permalink / raw
  To: Martin K. Petersen
  Cc: Christoph Hellwig, matthew, James.Bottomley, linux-scsi

On Tue, Nov 10 2009, Martin K. Petersen wrote:
> >>>>> "Christoph" == Christoph Hellwig <hch@infradead.org> writes:
> 
> Christoph> So actually I'll take my previous comment back.  I think
> Christoph> these patches are good enough to put them in now to have
> Christoph> something we can build on.
> 
> Ok.  Jens, please queue the patch below.  And then we'll have to see
> about getting James to rebase and/or postpone the SCSI portion to round
> #2.

Queued up for 2.6.33.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-10  5:37     ` Martin K. Petersen
  2009-11-10 10:51       ` Jens Axboe
@ 2009-11-10 13:16       ` Christoph Hellwig
  2009-11-10 18:58         ` Martin K. Petersen
  1 sibling, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2009-11-10 13:16 UTC (permalink / raw
  To: Martin K. Petersen
  Cc: axboe, Christoph Hellwig, matthew, James.Bottomley, linux-scsi

On Tue, Nov 10, 2009 at 12:37:37AM -0500, Martin K. Petersen wrote:
> >>>>> "Christoph" == Christoph Hellwig <hch@infradead.org> writes:
> 
> Christoph> So actually I'll take my previous comment back.  I think
> Christoph> these patches are good enough to put them in now to have
> Christoph> something we can build on.
> 
> Ok.  Jens, please queue the patch below.  And then we'll have to see
> about getting James to rebase and/or postpone the SCSI portion to round
> #2.

Can't we please get both patches in through one tree?  Everything else
is a bit of a pain to manage.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-10 13:16       ` Christoph Hellwig
@ 2009-11-10 18:58         ` Martin K. Petersen
  2009-11-10 19:10           ` James Bottomley
  0 siblings, 1 reply; 18+ messages in thread
From: Martin K. Petersen @ 2009-11-10 18:58 UTC (permalink / raw
  To: Christoph Hellwig
  Cc: Martin K. Petersen, axboe, matthew, James.Bottomley, linux-scsi

>>>>> "Christoph" == Christoph Hellwig <hch@infradead.org> writes:

>> Ok.  Jens, please queue the patch below.  And then we'll have to see
>> about getting James to rebase and/or postpone the SCSI portion to
>> round #2.

Christoph> Can't we please get both patches in through one tree?
Christoph> Everything else is a bit of a pain to manage.

I'm intimately familiar with said pain.  Pretty much everything I do
touches both block and scsi :/

But James has been pretty good about rebasing scsi-misc on top of Jens'
tree.  That's how we've done all the other tricky merges.

James, what do you think?

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-10 18:58         ` Martin K. Petersen
@ 2009-11-10 19:10           ` James Bottomley
  2009-11-10 23:06             ` Martin K. Petersen
  0 siblings, 1 reply; 18+ messages in thread
From: James Bottomley @ 2009-11-10 19:10 UTC (permalink / raw
  To: Martin K. Petersen; +Cc: Christoph Hellwig, axboe, matthew, linux-scsi

On Tue, 2009-11-10 at 13:58 -0500, Martin K. Petersen wrote:
> >>>>> "Christoph" == Christoph Hellwig <hch@infradead.org> writes:
> 
> >> Ok.  Jens, please queue the patch below.  And then we'll have to see
> >> about getting James to rebase and/or postpone the SCSI portion to
> >> round #2.
> 
> Christoph> Can't we please get both patches in through one tree?
> Christoph> Everything else is a bit of a pain to manage.
> 
> I'm intimately familiar with said pain.  Pretty much everything I do
> touches both block and scsi :/
> 
> But James has been pretty good about rebasing scsi-misc on top of Jens'
> tree.  That's how we've done all the other tricky merges.
> 
> James, what do you think?

SCSI pretty much pioneered the use of postmerge trees, which is how I do
this ... if everyone's happy, that's what I'll do this time.

James




^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Thin provisioning update
  2009-11-10 19:10           ` James Bottomley
@ 2009-11-10 23:06             ` Martin K. Petersen
  0 siblings, 0 replies; 18+ messages in thread
From: Martin K. Petersen @ 2009-11-10 23:06 UTC (permalink / raw
  To: James Bottomley
  Cc: Martin K. Petersen, Christoph Hellwig, axboe, matthew, linux-scsi

>>>>> "James" == James Bottomley <James.Bottomley@HansenPartnership.com> writes:

James> SCSI pretty much pioneered the use of postmerge trees, which is
James> how I do this ... if everyone's happy, that's what I'll do this
James> time.

In the meantime I'll try to keep my TP branch up to date for people who
want to tinker with this.

  http://www.kernel.org/pub/scm/linux/kernel/git/mkp/linux-2.6-mkp.git

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2009-11-10 23:08 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-11-04  4:25 Thin provisioning update Martin K. Petersen
2009-11-04  4:25 ` [PATCH 1/2] block: Expose discard granularity Martin K. Petersen
2009-11-04  4:25 ` [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support Martin K. Petersen
2009-11-09 14:20 ` Thin provisioning update Christoph Hellwig
2009-11-09 19:34   ` Christoph Hellwig
2009-11-10  5:37     ` Martin K. Petersen
2009-11-10 10:51       ` Jens Axboe
2009-11-10 13:16       ` Christoph Hellwig
2009-11-10 18:58         ` Martin K. Petersen
2009-11-10 19:10           ` James Bottomley
2009-11-10 23:06             ` Martin K. Petersen
  -- strict thread matches above, loose matches on Subject: below --
2009-10-30  3:30 [RFC] Thin provisioning bits Martin K. Petersen
2009-10-30  3:30 ` [PATCH 2/2] sd: WRITE SAME(16) / UNMAP support Martin K. Petersen
2009-10-30  4:28   ` Douglas Gilbert
2009-10-30  4:53     ` Martin K. Petersen
2009-10-30 16:02       ` Ric Wheeler
2009-10-30  5:11   ` Christoph Hellwig
2009-11-02 13:32     ` Martin K. Petersen
2009-11-03 15:12       ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).