From martin.petersen@oracle.com Fri Dec 18 00:00:03 2009 Date: Fri, 18 Dec 2009 02:54:02 -0500 From: Martin K. Petersen To: Eric Sandeen , Mike Snitzer , Christian Kujau , Heinz Diehl Cc: jens.axboe@oracle.com Subject: [PATCH] block: Fix topology stacking for data and discard alignment I discovered a couple of corner cases in the topology stacking that required major surgery to get right. The old stacking approach treated '0' as a soft limit because it could essentially mean both 'default' and 'naturally aligned'. The new function handles this correctly by honoring any granularity reported along with the alignment_offset. The patch below also fixes the problem with the default discard granularity that caused warnings when stacking non-trim devices. For testing I ended up instrumenting the stacking function in userland and feeding it a bunch of device limit combinations that I could compare to a list of expected results. However, there may be caveats lurking when testing within the confines of the kernel. So I'd appreciate it if you guys could try out the following patch before we pass it upstream... Thanks, Martin -- Martin K. Petersen Oracle Linux Engineering The stacking code incorrectly scaled up the data offset in some cases causing misaligned devices to report alignment. Rewrite the stacking algorithm to remedy this and apply the same alignment principles to the discard handling. Signed-off-by: Martin K. Petersen --- diff --git a/block/blk-settings.c b/block/blk-settings.c index 6ae118d..e14fcbc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -517,9 +517,8 @@ static unsigned int lcm(unsigned int a, unsigned int b) int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { - int ret; - - ret = 0; + sector_t alignment; + unsigned int top, bottom, granularity; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -537,6 +536,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + granularity = max(b->physical_block_size, b->io_min); + alignment = b->alignment_offset - (offset & (granularity - 1)); + + if (t->alignment_offset != alignment) { + + top = max(t->physical_block_size, t->io_min) + + t->alignment_offset; + bottom = granularity + alignment; + + if (max(top, bottom) & (min(top, bottom) - 1)) + t->misaligned = 1; + } + t->logical_block_size = max(t->logical_block_size, b->logical_block_size); @@ -544,54 +556,55 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); + t->io_opt = lcm(t->io_opt, b->io_opt); + t->no_cluster |= b->no_cluster; t->discard_zeroes_data &= b->discard_zeroes_data; - /* Bottom device offset aligned? */ - if (offset && - (offset & (b->physical_block_size - 1)) != b->alignment_offset) { + if (t->physical_block_size & (t->logical_block_size - 1)) { + t->physical_block_size = t->logical_block_size; t->misaligned = 1; - ret = -1; } - /* - * Temporarily disable discard granularity. It's currently buggy - * since we default to 0 for discard_granularity, hence this - * "failure" will always trigger for non-zero offsets. - */ -#if 0 - if (offset && - (offset & (b->discard_granularity - 1)) != b->discard_alignment) { - t->discard_misaligned = 1; - ret = -1; + if (t->io_min & (t->physical_block_size - 1)) { + t->io_min = t->physical_block_size; + t->misaligned = 1; } -#endif - /* If top has no alignment offset, inherit from bottom */ - if (!t->alignment_offset) - t->alignment_offset = - b->alignment_offset & (b->physical_block_size - 1); + if (t->io_opt & (t->physical_block_size - 1)) { + t->io_opt = 0; + t->misaligned = 1; + } - if (!t->discard_alignment) - t->discard_alignment = - b->discard_alignment & (b->discard_granularity - 1); + t->alignment_offset = lcm(t->alignment_offset, alignment) + & (max(t->physical_block_size, t->io_min) - 1); - /* Top device aligned on logical block boundary? */ - if (t->alignment_offset & (t->logical_block_size - 1)) { + if (t->alignment_offset & (t->logical_block_size - 1)) t->misaligned = 1; - ret = -1; - } - /* Find lcm() of optimal I/O size and granularity */ - t->io_opt = lcm(t->io_opt, b->io_opt); - t->discard_granularity = lcm(t->discard_granularity, - b->discard_granularity); + /* Discard alignment and granularity */ + if (b->discard_granularity) { + + alignment = b->discard_alignment - + (offset & (b->discard_granularity - 1)); + + if (t->discard_granularity != 0 && + t->discard_alignment != alignment) { + top = t->discard_granularity + t->discard_alignment; + bottom = b->discard_granularity + alignment; - /* Verify that optimal I/O size is a multiple of io_min */ - if (t->io_min && t->io_opt % t->io_min) - ret = -1; + /* Verify that top and bottom intervals line up */ + if (max(top, bottom) & (min(top, bottom) - 1)) + t->discard_misaligned = 1; + } + + t->discard_granularity = max(t->discard_granularity, + b->discard_granularity); + t->discard_alignment = lcm(t->discard_alignment, alignment) & + (t->discard_granularity - 1); + } - return ret; + return t->misaligned ? -1 : 0; } EXPORT_SYMBOL(blk_stack_limits);