--- linux-2.4.30/drivers/md/raid1.c.pre-fr1 Mon Apr 4 03:42:19 2005 +++ linux-2.4.30/drivers/md/raid1.c Wed Apr 6 22:38:41 2005 @@ -20,6 +20,28 @@ * You should have received a copy of the GNU General Public License * (for example /usr/src/linux/COPYING); if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Changes by Peter T. Breuer 31/1/2003 to support + * bitmapped intelligence in resync: + * + * - bitmap attached on setfaulty (mark bad) + * - bitmap marked during normal i/o if faulty disk + * - bitmap used to skip nondirty blocks during sync + * - bitmap removed on set active + * + * Minor changes are needed in raid1.h (extra fields in conf) and in + * md.c (support hotadd directly after setfaulty, or disk recognition). + * + * More changes by PTB 20/2/2003 to let the bitmap always be present and + * thus allow asynchronous mirror writes by using it as a journal log. + * + * Changes by PTB 10/8/2004 to redo read-balancing so that it reads + * from the fastest disk, as determined by latency testing every so + * often. + * + * Changes by PTB 6/1/2005 to make read errors not fault the disk out + * of the array but cause retries instead. And also (with CORRECT set) + * trigger rewrite of the bad sector. */ #include @@ -32,7 +54,15 @@ #define MD_DRIVER #define MD_PERSONALITY +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) +#define MAX_WORK_PER_DISK (128 * 8) +#define MAX_TEST_PER_DISK 64 +#define LATENCY_OLD_WEIGHT 9 +#define LATENCY_NEW_WEIGHT 1 +#define LATENCY_SUM_WEIGHT (LATENCY_OLD_WEIGHT + LATENCY_NEW_WEIGHT) +#else #define MAX_WORK_PER_DISK 128 +#endif /* CONFIG_MD_FR1 */ #define NR_RESERVED_BUFS 32 @@ -50,11 +80,19 @@ #define PRINTK(x...) 
do { } while (0) #endif +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) +#include "bitmap.h" +#endif /* CONFIG_MD_FR1 */ static mdk_personality_t raid1_personality; static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) +/* module params */ +static int async; /* async writes */ +#endif /* CONFIG_MD_FR1 */ + static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) { /* return a linked list of "cnt" struct buffer_heads. @@ -325,6 +363,9 @@ { raid1_conf_t *conf = mddev_to_conf(mddev); int i, disks = MD_SB_DISKS; +#ifdef CONFIG_MD_RAID1_ROBUST_READ + kdev_t dev = *rdev; +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ unsigned long flags; /* @@ -332,6 +373,31 @@ * now we use the first available disk. */ +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* + * Uh, no. Choose the next disk if we can, not the first. + */ + md_spin_lock_irqsave(&conf->device_lock, flags); + for (i = 0; i < conf->raid_disks; i++) { + if (conf->mirrors[i].dev == dev) + break; + } + i++; + if (i >= conf->raid_disks) + i = 0; + for (; i < conf->raid_disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; + md_spin_unlock_irqrestore(&conf->device_lock, flags); /* never return holding device_lock */ + return (0); + } + } + md_spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * If for some reason we found nothing, drop through and use the old + * routine. + */ +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ md_spin_lock_irqsave(&conf->device_lock, flags); for (i = 0; i < disks; i++) { if (conf->mirrors[i].operational) { @@ -414,9 +532,27 @@ /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) - md_error (r1_bh->mddev, bh->b_dev); - else + if (!uptodate) { +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* + * Only fault disk out of array on write error, not read. 
+ */ + if (r1_bh->cmd == WRITE) + if (printk(KERN_ALERT + "raid1: erroring bh WRITE for sector %ld\n", + bh->b_rsector), 1) +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ + md_error (r1_bh->mddev, bh->b_dev); +#ifdef CONFIG_MD_RAID1_READ_WRITE_CORRECT + } else { /* tell next time we're here that we're a retry */ + printk(KERN_ALERT + "raid1: set retry bit on bh READ for sector %ld\n", + bh->b_rsector); + set_bit(R1BH_ReadRetry, &r1_bh->state); + } +#endif /* CONFIG_MD_RAID1_READ_WRITE_CORRECT */ + + } else /* * Set R1BH_Uptodate in our master buffer_head, so that * we will return a good error code for to the higher @@ -438,7 +573,21 @@ * we have only one buffer_head on the read side */ - if (uptodate) { + if (uptodate +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* Give up and error if we're last */ + || atomic_dec_and_test(&r1_bh->remaining) +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ + ) { +#ifdef CONFIG_MD_RAID1_READ_WRITE_CORRECT + if (uptodate && test_bit(R1BH_ReadRewrite, &r1_bh->state)) { + /* Success at last - rewrite failed reads */ + r1_bh->cmd = SPECIAL; + raid1_reschedule_retry(r1_bh); + return; + } else +#endif /* CONFIG_MD_RAID1_READ_WRITE_CORRECT */ + raid1_end_bh_io(r1_bh, uptodate); return; } @@ -447,6 +596,13 @@ */ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", partition_name(bh->b_dev), bh->b_blocknr); +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* + * if not uptodate and not the last possible try, + * bh will be rescheduled and repointed while on the + * queue, by raid1_map. + */ +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ raid1_reschedule_retry(r1_bh); return; } @@ -456,10 +612,39 @@ * * Let's see if all mirrored write operations have finished * already. + * + * In any case, do the end io early on the master bh if we are + * uptodate, and AsyncIO is set on the bh. We set AsyncPhase + * when this happens, so we don't do it twice, inadvertently. 
*/ + +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + if (uptodate + && test_bit(R1BH_AsyncIO, &r1_bh->state) + && !test_and_set_bit(R1BH_AsyncPhase, &r1_bh->state)) { - if (atomic_dec_and_test(&r1_bh->remaining)) + struct buffer_head *mbh = r1_bh->master_bh; + + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + + PRINTK(KERN_DEBUG "raid1: async end i/o on sectors %lu-%lu\n", + mbh->b_rsector, mbh->b_rsector + (mbh->b_size >> 9) - 1); + + io_request_done(mbh->b_rsector, conf, + test_bit(R1BH_SyncPhase, &r1_bh->state)); + mbh->b_end_io(mbh, uptodate); + } +#endif /* CONFIG_MD_FR1 */ + + if (atomic_dec_and_test(&r1_bh->remaining)) { +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + if (test_and_set_bit(R1BH_AsyncIO, &r1_bh->state)) { + /* we made a copy for the buffer, remove it now */ + kfree(bh->b_data); + } +#endif /* CONFIG_MD_FR1 */ raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); + } } /* @@ -520,7 +705,7 @@ * Don't touch anything for sequential reads. 
*/ - if (this_sector == conf->mirrors[new_disk].head_position) + if (0 && /* PTB */ this_sector == conf->mirrors[new_disk].head_position) goto rb_out; /* @@ -531,7 +716,16 @@ */ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { - conf->sect_count = 0; + +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + PRINTK(KERN_INFO + "raid1: disk %d latency %d abandoned after %d sectors\n", + new_disk, + conf->latency[new_disk], + conf->sect_count); + + /* PTB move on to run a short test on the next disk */ +#endif /* CONFIG_MD_FR1 */ #if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92) /* Work around a compiler bug in egcs-2.92.11 19980921 */ @@ -546,6 +740,39 @@ } while ((conf->mirrors[new_disk].write_only) || (!conf->mirrors[new_disk].operational)); +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* PTB if tested all, need to choose best */ + if (new_disk == conf->last_source) { + + int fastest = -1; + unsigned long best_latency = 0x7fffffff; + int i; + + for (i = 0; i < conf->raid_disks; i++) { + if (conf->mirrors[i].write_only + || !conf->mirrors[i].operational) + continue; + if (conf->latency[i] <= best_latency) { + best_latency = conf->latency[i]; + fastest = i; + } + } + if (fastest >= 0) + new_disk = fastest; + conf->mirrors[new_disk].sect_limit = MAX_WORK_PER_DISK; + conf->last_source = new_disk; + } else { + /* PTB only a short test run */ + conf->mirrors[new_disk].sect_limit = MAX_TEST_PER_DISK; + } + + conf->sect_count = 0; + PRINTK(KERN_DEBUG + "raid1: choosing disk %d latency %d\n", + new_disk, + conf->latency[new_disk]); +#endif /* CONFIG_MD_FR1 */ + goto rb_out; } @@ -596,6 +823,11 @@ int disks = MD_SB_DISKS; int i, sum_bhs = 0; struct mirror_info *mirror; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + int sum_nobhs = 0; + struct bitmap * bitmap = conf->bitmap; + char * async_data; // copy of buffer used for async writes +#endif /* CONFIG_MD_FR1 */ kdev_t dev; if (!buffer_locked(bh)) @@ 
-635,6 +867,10 @@ r1_bh->master_bh = bh; r1_bh->mddev = mddev; r1_bh->cmd = rw; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + r1_bh->start_jiffies = jiffies; /* PTB record start time */ + async_data = NULL; +#endif /* CONFIG_MD_FR1 */ if (rw == READ) { /* @@ -653,6 +889,20 @@ /* bh_req->b_rsector = bh->n_rsector; */ bh_req->b_end_io = raid1_end_request; bh_req->b_private = r1_bh; +#ifdef CONFIG_MD_RAID1_ROBUST_READ + atomic_set(&r1_bh->remaining, 0); + /* count target devices under spinlock */ + md_spin_lock_irq(&conf->device_lock); + for (i = 0; i < disks; i++) { + if (!conf->mirrors[i].operational + || !conf->mirrors[i].used_slot) { + continue; + } + atomic_inc(&r1_bh->remaining); + } + md_spin_unlock_irq(&conf->device_lock); +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ + generic_make_request (rw, bh_req); return 0; } @@ -662,11 +912,65 @@ */ bhl = raid1_alloc_bh(conf, conf->raid_disks); + +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + if (bitmap->active(bitmap)) { + + int err = bitmap->setbits(bitmap, (bitmap_offset_t) (bh->b_rsector >> 1), + bh->b_size >> 10); + + /* + * PTB Do async i/o if we marked the bitmap (so it's safe to) + * and we are supposed to. + */ + if (async && err >= 0) { + async_data = kmalloc(bh->b_size, GFP_KERNEL); + if (async_data) { + memcpy(async_data, bh->b_data, bh->b_size); + set_bit(R1BH_AsyncIO, &r1_bh->state); + } + } + /* + * PTB Even if the async bit is not set then we STILL need to + * balance the setbits above with a clearbits in the end_io + * whether setbits errored or not above. That's because + * setbits errors if the bitmap page is not there and + * then we can only count attempted writes in the bitmap, + * not actual writes, so we have to balance that with + * attempted clears. And we do. See the end_io. 
+ */ + } +#endif /* CONFIG_MD_FR1 */ spin_lock_irq(&conf->device_lock); for (i = 0; i < disks; i++) { struct buffer_head *mbh; - if (!conf->mirrors[i].operational) + if (!conf->mirrors[i].operational) { + +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + struct bitmap * bitmap = conf->bitmap; + + if (!conf->mirrors[i].used_slot) { + continue; + } + + /* notionally mark bitmap here */ + if (sum_nobhs++ <= 0) { + PRINTK(KERN_DEBUG "raid1: mark mirror %d blk %lu-%lu\n", + i, bh->b_rsector >> 1, + (bh->b_rsector >> 1) + (bh->b_size >> 10) - 1); + } + + if (!conf->bitmap_dirty && bitmap->active(bitmap)) { + conf->bitmap_dirty = 1; + MD_SB_EVENTS_LO(mddev->sb) = + mddev->sb->events_lo; + MD_SB_EVENTS_HI(mddev->sb) = + mddev->sb->events_hi; + } + +#endif /* CONFIG_MD_FR1 */ continue; + } /* * We should use a private pool (size depending on NR_REQUEST), @@ -703,6 +1007,10 @@ mbh->b_size = bh->b_size; mbh->b_page = bh->b_page; mbh->b_data = bh->b_data; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + mbh->b_data = + test_bit(R1BH_AsyncIO, &r1_bh->state)? 
async_data : bh->b_data; +#endif /* CONFIG_MD_FR1 */ mbh->b_list = BUF_LOCKED; mbh->b_end_io = raid1_end_request; mbh->b_private = r1_bh; @@ -719,6 +1027,9 @@ return 0; } md_atomic_set(&r1_bh->remaining, sum_bhs); +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + r1_bh->nonoperational = sum_nobhs; +#endif /* CONFIG_MD_FR1 */ /* * We have to be a bit careful about the semaphore above, thats @@ -769,6 +1080,85 @@ #define ALREADY_SYNCING KERN_INFO \ "raid1: syncing already in progress.\n" +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) +static int +raid1_create_bitmap(mddev_t *mddev) { + /* Allocate a bitmap head, init it for sb->size << 1 blocks and attach it to conf under segment_lock; returns 0 or -ENOMEM. */ + struct bitmap * bitmap; + unsigned long blocks; + raid1_conf_t *conf = mddev_to_conf(mddev); + + /* need size to have been set already */ + blocks = mddev->sb->size << 1; + + bitmap = kmalloc (sizeof (*bitmap), GFP_KERNEL); + if (!bitmap) { + printk(KERN_WARNING "raid1: out of memory for bitmap head\n"); + return -ENOMEM; + } + + if (bitmap_init (bitmap, blocks) < 0) { + printk(KERN_WARNING "raid1: failed to init bitmap\n"); + kfree(bitmap); + return -ENOMEM; + } + + /* take the spinlock for the ops on the configuration */ + spin_lock_irq(&conf->segment_lock); + conf->bitmap = bitmap; + conf->bitmap_dirty = 0; + spin_unlock_irq(&conf->segment_lock); + return 0; +} + +static void +raid1_remove_bitmap (mddev_t *mddev) { + /* Detach conf->bitmap under segment_lock, then destroy and free it outside the lock; does nothing if no bitmap is attached. */ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct bitmap * bitmap; + + spin_lock_irq(&conf->segment_lock); + bitmap = conf->bitmap; + if (!bitmap) { + spin_unlock_irq(&conf->segment_lock); + return; + } + conf->bitmap = NULL; + spin_unlock_irq(&conf->segment_lock); + + bitmap_destr(bitmap); + kfree(bitmap); +} + +static int +raid1_start_bitmap (mddev_t *mddev) { + /* Snapshot conf->bitmap under segment_lock; returns -EINVAL when no bitmap is attached, otherwise goes on to start it if not yet active. */ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct bitmap * bitmap; + + spin_lock_irq(&conf->segment_lock); + bitmap = conf->bitmap; + spin_unlock_irq(&conf->segment_lock); + if (!bitmap) { + return -EINVAL; + } + + if (bitmap->active(bitmap)) { + printk(KERN_WARNING "raid1: 
bitmap %p already active!\n", + bitmap); + return 0; + } + if (bitmap->start(bitmap, md_event(mddev->sb)) < 0) { + printk(KERN_WARNING "raid1: bitmap %p failed to start!\n", + bitmap); + return -EINVAL; + } + /* pointers are logged with %p: an (unsigned) cast would truncate them on 64-bit */ + PRINTK(KERN_DEBUG "raid1: made bitmap %p\n", bitmap); + return 0; +} +#endif /* CONFIG_MD_FR1 */ + static void mark_disk_bad (mddev_t *mddev, int failed) { raid1_conf_t *conf = mddev_to_conf(mddev); @@ -777,6 +1167,13 @@ mirror->operational = 0; mark_disk_faulty(sb->disks+mirror->number); +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* + * Activate the bitmap on a mirror just marked faulty (and + * nonoperational). + */ + raid1_start_bitmap (mddev); +#endif /* CONFIG_MD_FR1 */ mark_disk_nonsync(sb->disks+mirror->number); mark_disk_inactive(sb->disks+mirror->number); if (!mirror->write_only) @@ -848,6 +1245,14 @@ for (i = 0; i < MD_SB_DISKS; i++) { tmp = conf->mirrors + i; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* + * Remove repeats from debug printout. + */ + if (i > 0 && memcmp(tmp, &conf->mirrors[i-1], sizeof(*tmp)) == 0) { + continue; + } +#endif /* CONFIG_MD_FR1 */ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", i, tmp->spare,tmp->operational, tmp->number,tmp->raid_disk,tmp->used_slot, @@ -939,16 +1344,36 @@ case DISKOP_SPARE_WRITE: case DISKOP_SPARE_INACTIVE: +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + PRINTK(KERN_DEBUG "raid1: diskop SPARE %s\n", + state == DISKOP_SPARE_WRITE ? "WRITE" : + state == DISKOP_SPARE_INACTIVE ? "INACTIVE" : + state == DISKOP_SPARE_ACTIVE ? "ACTIVE" : "" + ); +#endif /* CONFIG_MD_FR1 */ /* * Find the spare disk ... 
(can only be in the 'high' * area of the array) */ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { tmp = conf->mirrors + i; - if (tmp->spare && tmp->number == (*d)->number) { +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + if (tmp->spare + && (tmp->number == (*d)->number + /* + * I'm not sure we now need to allow match by + * device number too. FIXME. + */ + || tmp->dev == MKDEV((*d)->major,(*d)->minor))) { spare_disk = i; break; } +#else + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } +#endif /* CONFIG_MD_FR1 */ } if (spare_disk == -1) { MD_BUG(); @@ -1104,6 +1529,10 @@ fdisk->spare = 0; fdisk->write_only = 0; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + PRINTK(KERN_DEBUG "raid1: diskop SPARE device %x now ACTIVE\n", + fdisk->dev); +#endif /* CONFIG_MD_FR1 */ /* * if we activate a spare, we definitely replace a * non-operational disk slot in the 'low' area of @@ -1115,6 +1544,11 @@ break; case DISKOP_HOT_REMOVE_DISK: + +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + PRINTK(KERN_DEBUG "raid1: diskop HOT REMOVE\n"); +#endif /* CONFIG_MD_FR1 */ + rdisk = conf->mirrors + removed_disk; if (rdisk->spare && (removed_disk < conf->raid_disks)) { @@ -1148,6 +1582,11 @@ adisk->head_position = 0; conf->nr_disks++; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + PRINTK(KERN_DEBUG "raid1: diskop HOT ADDed mirror %d disk %d bitmap %p\n", + added_disk, adisk->number, conf->bitmap); +#endif /* CONFIG_MD_FR1 */ + break; default: @@ -1292,6 +1731,13 @@ case READA: dev = bh->b_dev; raid1_map (mddev, &bh->b_dev); +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* raid1_map incorrectly used to change target to + * 0th disk always - now I hope it does a + * better job than before and switches target to + * next disk in the mirror. 
+ */ +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ if (bh->b_dev == dev) { printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); raid1_end_bh_io(r1_bh, 0); @@ -1398,6 +1844,22 @@ int block_nr; int buffs; kdev_t dev; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* + * Will need to count mirror components currently with a bitmap + * which have been marked faulty and nonoperational at some + * point beforehand, and have been accumulating marks on the + * bitmap to indicate dirty blocks that need syncing. + */ + struct bitmap * bitmap = conf->bitmap; + int count, block_not_dirty; + int targets[MD_SB_DISKS]; + /* + * PTB discount the skipped sectors back to the md.c code + */ + extern atomic_t md_throttle[]; + +#endif /* CONFIG_MD_FR1 */ if (!sector_nr) { /* we want enough buffers to hold twice the window of 128*/ @@ -1406,9 +1868,29 @@ if (buffs < 2) goto nomem; conf->window = buffs*(PAGE_SIZE>>9)/2; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* also remove bitmap if not indicated */ + if (! MD_SB_BITMAP_REPAIR(mddev->sb)) { + /* has to be outside spinlock as it takes it */ + printk(KERN_WARNING "md%d: removed bitmap %p\n", + mdidx(mddev), bitmap); + bitmap->stop (bitmap); + } else { + printk(KERN_WARNING "md%d: retained bitmap %p\n", + mdidx(mddev), bitmap); + } + /* reset the bitmap indicator always */ + MD_SB_BITMAP_REPAIR(mddev->sb) = 0; +#endif /* CONFIG_MD_FR1 */ } spin_lock_irq(&conf->segment_lock); if (!sector_nr) { +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* setup extra report counters for skipped/synced blocks */ + conf->sync_mode = -1; + conf->last_clean_sector = -1; + conf->last_dirty_sector = -1; +#endif /* CONFIG_MD_FR1 */ /* initialize ...*/ conf->start_active = 0; conf->start_ready = 0; @@ -1422,7 +1904,7 @@ MD_BUG(); } while (sector_nr >= conf->start_pending) { - PRINTK("wait .. 
sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", + PRINTK("wait .. sect=%lu start_active=%ld ready=%ld pending=%ld future=%ld, cnt_done=%d active=%d ready=%d pending=%d future=%d\n", sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future, conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future); wait_event_lock_irq(conf->wait_done, @@ -1463,9 +1948,71 @@ conf->last_used = disk; mirror = conf->mirrors+conf->last_used; + +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* PTB go looking for the faulted (nonoperational) mirrors, under lock */ + count = 0; + while (1) { + const int maxdisk = 2 * conf->raid_disks - conf->working_disks; + if (disk <= 0) + disk = maxdisk > MD_SB_DISKS ? MD_SB_DISKS : maxdisk; + disk--; + if (disk == conf->last_used) + break; + if (!conf->mirrors[disk].operational) + continue; + /* We need them to be writable */ + if (conf->mirrors[disk].write_only) { + targets[count++] = disk; + } + } + + bitmap = conf->bitmap; + block_not_dirty = bitmap->active(bitmap) + && !bitmap->testbits(bitmap, (bitmap_offset_t) (sector_nr >> 1), 1); +#endif /* CONFIG_MD_FR1 */ + dev = mirror->dev; spin_unlock_irq(&conf->device_lock); - + +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + if (count > 0 && block_not_dirty) { + + const int done = 2 - (sector_nr & 1); + + md_sync_acct(mirror->dev, done); + sync_request_done(sector_nr, conf); + md_done_sync(mddev, done, 1); + + /* do these conf accesses under lock, though only accounting */ + spin_lock_irq(&conf->segment_lock); + if (conf->sync_mode != 0) { + if (conf->sync_mode == 1) { + printk(KERN_INFO "raid1: synced dirty sectors %lu-%lu\n", + conf->last_clean_sector+1, + conf->last_dirty_sector); + } + conf->sync_mode = 0; + } + + conf->last_clean_sector = sector_nr + done - 1; + if (mddev->sb && sector_nr + done >= mddev->sb->size<<1) { + printk(KERN_INFO 
"raid1: skipped clean sectors %lu-%lu\n", + conf->last_dirty_sector+1, + conf->last_clean_sector); + } + + /* PTB here be dragons - update md driver throttle discount */ + atomic_add(done, &md_throttle[mdidx(mddev)]); + spin_unlock_irq(&conf->segment_lock); + + wake_up(&conf->wait_ready); + /* skip remainder of block */ + return done; + } + + /* read */ +#endif /* CONFIG_MD_FR1 */ r1_bh = raid1_alloc_buf (conf); r1_bh->master_bh = NULL; r1_bh->mddev = mddev; @@ -1499,6 +2046,30 @@ generic_make_request(READ, bh); md_sync_acct(bh->b_dev, bh->b_size/512); +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* printout info from time to time */ + spin_lock_irq(&conf->segment_lock); + if (conf->sync_mode != 1) { + if (conf->sync_mode == 0) { + printk(KERN_INFO "raid1: skipped clean sectors %lu-%lu\n", + conf->last_dirty_sector+1, + conf->last_clean_sector); + + + } + conf->sync_mode = 1; + } + conf->last_dirty_sector = sector_nr + (bsize >> 9) - 1; + + if (mddev->sb && sector_nr + (bsize >> 9) >= mddev->sb->size<<1) { + printk(KERN_INFO "raid1: synced dirty sectors %lu-%lu\n", + conf->last_clean_sector+1, + conf->last_dirty_sector); + } + + spin_unlock_irq(&conf->segment_lock); +#endif /* CONFIG_MD_FR1 */ + return (bsize >> 9); nomem: @@ -1531,6 +2102,14 @@ mddev_t *mddev = r1_bh->mddev; unsigned long sect = bh->b_blocknr; int size = bh->b_size; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + raid1_conf_t * conf = mddev_to_conf(mddev); + struct bitmap * bitmap = conf->bitmap; + if (bitmap && bitmap->active(bitmap)) { + /* PTB clean the bitmap after resync */ + bitmap->clearbits(bitmap, (bitmap_offset_t)(sect >> 1), size >> 10); + } +#endif /* CONFIG_MD_FR1 */ raid1_free_buf(r1_bh); sync_request_done(sect, mddev_to_conf(mddev)); md_done_sync(mddev,size>>9, uptodate); @@ -1576,6 +2155,11 @@ #define START_RESYNC KERN_WARNING \ "raid1: raid set md%d not clean; reconstructing mirrors\n" +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) 
+#define BITMAP_ERROR KERN_ERR \ +"raid1: out of memory for bitmap on md%d\n" +#endif /* CONFIG_MD_FR1 */ + static int raid1_run (mddev_t *mddev) { raid1_conf_t *conf; @@ -1744,6 +2328,16 @@ /* nothing */; conf->last_used = j; +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + /* make the bitmap at this point - hope mddev->size exists already */ + if (raid1_create_bitmap(mddev) < 0) { + printk(BITMAP_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + /* set it active too */ + raid1_start_bitmap (mddev); +#endif /* CONFIG_MD_FR1 */ { @@ -1803,6 +2397,9 @@ raid1_shrink_r1bh(conf); raid1_shrink_bh(conf); raid1_shrink_buffers(conf); +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + raid1_remove_bitmap (mddev); +#endif /* CONFIG_MD_FR1 */ kfree(conf); mddev->private = NULL; out: @@ -1864,6 +2461,9 @@ raid1_shrink_r1bh(conf); raid1_shrink_bh(conf); raid1_shrink_buffers(conf); +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) + raid1_remove_bitmap (mddev); +#endif /* CONFIG_MD_FR1 */ kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; @@ -1896,4 +2496,8 @@ module_init(raid1_init); module_exit(raid1_exit); +#if defined(CONFIG_MD_FR1) || defined(CONFIG_MD_FR1_MODULE) +MODULE_PARM(async, "i"); +MODULE_PARM_DESC(async, "Do async writes"); +#endif /* CONFIG_MD_FR1 */ MODULE_LICENSE("GPL");