raid1.c: fast-resync (FR1) bitmap, latency-weighted read balancing and
robust-read retry support for the md RAID1 driver (linux-2.6.11.6).

Every addition is guarded by CONFIG_MD_FR1, CONFIG_MD_RAID1_ROBUST_READ or
DO_ADD_READ_WRITE_CORRECT.  The bitmap operations come from "bitmap.h" and
the extra conf_t / r1bio_t fields from raid1.h; the patch's own header
comment says matching changes are needed in raid1.h and md.c.

NOTE(review): indentation and blank context lines were re-derived from the
hunk line counts; use "patch -l" if the context whitespace does not match.

--- linux-2.6.11.6/drivers/md/raid1.c.orig	Sat Mar 26 04:28:21 2005
+++ linux-2.6.11.6/drivers/md/raid1.c	Sun Jun 12 00:32:54 2005
@@ -20,6 +20,30 @@
  * You should have received a copy of the GNU General Public License
  * (for example /usr/src/linux/COPYING); if not, write to the Free
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
+ * bitmapped intelligence in resync:
+ *
+ * - bitmap attached on setfaulty (mark bad)
+ * - bitmap marked during normal i/o if faulty disk
+ * - bitmap used to skip nondirty blocks during sync
+ * - bitmap removed on set active
+ *
+ * Minor changes are needed in raid1.h (extra fields in conf) and in
+ * md.c (support hotadd directly after setfaulty, or disk recognition).
+ *
+ * Changes by PTB 10/8/2004 to redo read-balancing so that it reads
+ * from the fastest disk, as determined by latency testing every so
+ * often.
+ * Changes by PTB 6/1/2005 to make read errors not fault the disk out
+ * of the array but cause retries instead. And also (with CORRECT set)
+ * trigger rewrite of the bad sector.
+ * Changes by PTB 15/3/2005 to keep rdev from being kfreed in
+ * export_rdev in md.c and instead free it here during replacement of the rdev
+ * in add_disk. Otherwise we would trace along a freed struct to see if
+ * it represents the dev we are interested in replacing. Thanks to
+ * Denis Bonnenfant (denis DOT bonnenfant AT diderot DOT org) for finding
+ * this and several other associated problems in the 2.6.8.1 port.
  */
 
 #include <linux/raid/raid1.h>
@@ -29,6 +53,22 @@
  */
 #define	NR_RAID1_BIOS 256
 
+#ifdef CONFIG_MD_FR1
+/*
+ * When to consider switching read disks:
+ */
+#define MAX_WORK_PER_DISK (128 * 8)
+/*
+ * Weightings for calculating latency:
+ */
+#define MAX_TEST_PER_DISK 64
+#define LATENCY_OLD_WEIGHT 9
+#define LATENCY_NEW_WEIGHT 1
+#define LATENCY_SUM_WEIGHT (LATENCY_OLD_WEIGHT + LATENCY_NEW_WEIGHT)
+
+#include "bitmap.h"
+#endif /* CONFIG_MD_FR1 */
+
 static mdk_personality_t raid1_personality;
 static void unplug_slaves(mddev_t *mddev);
 
@@ -182,6 +222,60 @@
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 }
 
+static int
+map (mddev_t * mddev, mdk_rdev_t ** rdevp)
+{
+	conf_t *conf = mddev_to_conf (mddev);
+	int i, disks = conf->raid_disks;
+
+	/*
+	 * Later we do read balancing on the read side
+	 * now we use the first available disk.
+	 */
+
+	spin_lock_irq (&conf->device_lock);
+#ifdef CONFIG_MD_RAID1_ROBUST_READ
+	/*
+	 * Uh, no. Choose the next disk if we can, not the first.
+	 */
+	for (i = 0; i < disks; i++) {
+		if (conf->mirrors[i].rdev == *rdevp) {
+			i++;
+			break;
+		}
+	}
+	if (i >= disks)
+		i = 0;
+	for (; i < disks; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && rdev != *rdevp && rdev->in_sync) {
+			*rdevp = rdev;
+			atomic_inc (&rdev->nr_pending);
+			spin_unlock_irq (&conf->device_lock);
+			return i;
+		}
+	}
+	/*
+	 * If for some reason we found nothing, drop through and use the
+	 * old routine (which may pick the same disk again).
+	 */
+#endif /* CONFIG_MD_RAID1_ROBUST_READ */
+	for (i = 0; i < disks; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && rdev->in_sync) {
+			*rdevp = rdev;
+			atomic_inc (&rdev->nr_pending);
+			spin_unlock_irq (&conf->device_lock);
+			return i;
+		}
+	}
+	spin_unlock_irq (&conf->device_lock);
+
+	printk (KERN_ERR
+		"raid1_map(): huh, no more operational devices?\n");
+	return -1;
+}
+
 static void reschedule_retry(r1bio_t *r1_bio)
 {
 	unsigned long flags;
@@ -203,9 +297,51 @@
 static void raid_end_bio_io(r1bio_t *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
+#ifdef CONFIG_MD_FR1
+	/*
+	 * calculate latency on reads, and fold into rolling average
+	 * under lock.
+	 *
+	 * on writes, clear the bitmap if all disks were written
+	 */
+	int uptodate = test_bit(R1BIO_Uptodate, &r1_bio->state);
+	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	/* if we should mark the bitmap clean, do so */
+	if (uptodate && bio_data_dir(bio) == WRITE
+	    && r1_bio->nonoperational <= 0) {
+		struct bitmap * bitmap = conf->bitmap;
+		if (bitmap && bitmap->active(bitmap)) {
+			bitmap->clearbits(bitmap,
+				bio->bi_sector >> 1, bio->bi_size >> 10);
+		}
+	}
+	/* calculate the latency of the read device */
+	if (uptodate && (bio_data_dir(bio) == READ
+	    || bio_data_dir(bio) == READA)) {
+		unsigned long latency = jiffies - r1_bio->start_jiffies;
+		/* find the mirror component being read */
+		int mirror = r1_bio->read_disk;
+		unsigned long flags;	/* we may be in completion (irq) context */
+		if (latency < 120 * HZ) {	/* unsigned, so never negative */
+			/* count in 1/10ths if we have total weights 9+1 = 10 */
+			latency *= LATENCY_SUM_WEIGHT * LATENCY_SUM_WEIGHT;
+			spin_lock_irqsave(&conf->device_lock, flags);
+			conf->latency[mirror] *= LATENCY_OLD_WEIGHT;
+			conf->latency[mirror] += LATENCY_NEW_WEIGHT * latency;
+			conf->latency[mirror] /= LATENCY_SUM_WEIGHT;
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+		} else {
+			printk(KERN_ERR
+				"raid1: bad latency %lu jiffies on disk %d\n",
+				latency, mirror);
+		}
+	}
+	bio_endio(bio, bio->bi_size, uptodate ? 0 : -EIO);
+#else
 
 	bio_endio(bio, bio->bi_size,
 		test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+#endif /* CONFIG_MD_FR1 */
 	free_r1bio(r1_bio);
 }
 
@@ -234,9 +370,19 @@
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
+	if (!uptodate) {
+#ifdef CONFIG_MD_RAID1_ROBUST_READ
+		/*
+		 * Only fault disk out of array on write error, not read.
+		 */
+		if (0)
+#endif /* CONFIG_MD_RAID1_ROBUST_READ */
 		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-	else
+#ifdef DO_ADD_READ_WRITE_CORRECT
+		else	/* tell next time we're here that we're a retry */
+			set_bit(R1BIO_ReadRetry, &r1_bio->state);
+#endif /* DO_ADD_READ_WRITE_CORRECT */
+	} else
 		/*
 		 * Set R1BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -253,7 +399,19 @@
 	/*
 	 * we have only one bio on the read side
 	 */
-	if (uptodate)
+	if (uptodate
+#ifdef CONFIG_MD_RAID1_ROBUST_READ
+		/* Give up and error if we're last */
+		|| (atomic_dec_and_test(&r1_bio->remaining))
+#endif /* CONFIG_MD_RAID1_ROBUST_READ */
+		)
+#ifdef DO_ADD_READ_WRITE_CORRECT
+		if (uptodate && test_bit(R1BIO_ReadRetry, &r1_bio->state)) {
+			/* Success at last - rewrite failed reads */
+			set_bit(R1BIO_IsSync, &r1_bio->state);
+			reschedule_retry(r1_bio);
+		} else
+#endif /* DO_ADD_READ_WRITE_CORRECT */
 		raid_end_bio_io(r1_bio);
 	else {
 		/*
@@ -381,12 +539,22 @@
 	 */
 	if (conf->next_seq_sect == this_sector)
 		goto rb_out;
+#ifdef CONFIG_MD_FR1
+	/*
+	 * Make slower disks appear more distant.
+	 */
+	current_distance = abs(this_sector - conf->mirrors[disk].head_position)
+		* conf->latency[disk];
+
+	/* Find the disk whose head is closest (weighting by latency) */
+#else
 	if (this_sector == conf->mirrors[new_disk].head_position)
 		goto rb_out;
 
 	current_distance = abs(this_sector - conf->mirrors[disk].head_position);
 
 	/* Find the disk whose head is closest */
+#endif /* CONFIG_MD_FR1 */
 
 	do {
 		if (disk <= 0)
@@ -543,6 +730,9 @@
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = bio->bi_sector;
+#ifdef CONFIG_MD_FR1
+	r1_bio->start_jiffies = jiffies;
+#endif /* CONFIG_MD_FR1 */
 
 	r1_bio->state = 0;
 
@@ -570,6 +760,19 @@
 		read_bio->bi_end_io = raid1_end_read_request;
 		read_bio->bi_rw = READ;
 		read_bio->bi_private = r1_bio;
+#ifdef CONFIG_MD_RAID1_ROBUST_READ
+		atomic_set(&r1_bio->remaining, 0);
+		/* count source devices under spinlock */
+		spin_lock_irq(&conf->device_lock);
+		disks = conf->raid_disks;
+		for (i = 0; i < disks; i++) {
+			if (conf->mirrors[i].rdev &&
+			    !conf->mirrors[i].rdev->faulty) {
+				atomic_inc(&r1_bio->remaining);
+			}
+		}
+		spin_unlock_irq(&conf->device_lock);
+#endif /* CONFIG_MD_RAID1_ROBUST_READ */
 
 		generic_make_request(read_bio);
 		return 0;
@@ -636,6 +871,89 @@
 	seq_printf(seq, "]");
 }
 
+#ifdef CONFIG_MD_FR1
+/*
+ * Local bitmap support functions.
+ */
+static int
+create_bitmap(conf_t *conf) {
+
+	struct bitmap * bitmap;
+	unsigned long blocks;
+	mddev_t *mddev = conf->mddev;
+
+	/* need size to have been set already */
+	blocks = mddev->size;
+
+	bitmap = kmalloc (sizeof (*bitmap), GFP_KERNEL);
+	if (!bitmap) {
+		printk(KERN_WARNING "raid1: out of memory for bitmap head\n");
+		return -ENOMEM;
+	}
+
+	if (bitmap_init (bitmap, blocks) < 0) {
+		printk(KERN_WARNING "raid1: failed to init bitmap\n");
+		kfree(bitmap);
+		return -ENOMEM;
+	}
+
+	/* take the spinlock for the ops on the configuration */
+	spin_lock_irq(&conf->device_lock);
+	conf->bitmap = bitmap;
+	conf->bitmap_dirty = 0;
+	spin_unlock_irq(&conf->device_lock);
+	return 0;
+}
+
+static void
+remove_bitmap (conf_t *conf) {
+
+	struct bitmap * bitmap;
+
+	spin_lock_irq(&conf->device_lock);
+	bitmap = conf->bitmap;
+	if (!bitmap) {
+		spin_unlock_irq(&conf->device_lock);
+		return;
+	}
+	conf->bitmap = NULL;
+	spin_unlock_irq(&conf->device_lock);
+
+	bitmap_destr(bitmap);
+	kfree(bitmap);
+}
+
+static int
+start_bitmap (conf_t *conf) {
+	unsigned long flags;	/* error() calls us, possibly with irqs off */
+	mddev_t *mddev;
+	struct bitmap * bitmap;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	mddev = conf->mddev;
+	bitmap = conf->bitmap;
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	if (!bitmap) {
+		return -EINVAL;
+	}
+
+	if (bitmap->active(bitmap)) {
+		printk(KERN_WARNING "raid1: bitmap %p already active!\n",
+			bitmap);
+		return 0;
+	}
+	if (bitmap->start(bitmap, mddev->events) < 0) {
+		printk(KERN_WARNING "raid1: bitmap %p failed to start!\n",
+			bitmap);
+		return -EINVAL;
+	}
+
+	printk(KERN_INFO "raid1: made bitmap %p at events %x:%x\n",
+		bitmap, (unsigned)(u32)(mddev->events >> 32),
+		(unsigned)(u32)(mddev->events));
+	return 0;
+}
+#endif /* CONFIG_MD_FR1 */
 
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
@@ -669,6 +987,9 @@
 	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
 		"	Operation continuing on %d devices\n",
 		bdevname(rdev->bdev,b), conf->working_disks);
+#ifdef CONFIG_MD_FR1
+	start_bitmap(conf);
+#endif /* CONFIG_MD_FR1 */
 }
 
 static void print_conf(conf_t *conf)
@@ -687,6 +1008,14 @@
 	for (i = 0; i < conf->raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
 		tmp = conf->mirrors + i;
+#ifdef CONFIG_MD_FR1
+		/*
+		 * Remove repeats from debug printout.
+		 */
+		if (i > 0 && memcmp(tmp, &conf->mirrors[i-1], sizeof(*tmp)) == 0) {
+			continue;
+		}
+#endif /* CONFIG_MD_FR1 */
 		if (tmp->rdev)
 			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
 				i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
@@ -706,6 +1035,14 @@
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
+
+#ifdef CONFIG_MD_FR1
+	if (conf->bitmap) {
+		struct bitmap *bitmap = conf->bitmap;
+		bitmap->stop(bitmap);
+		bitmap->print_stats(bitmap);
+	}
+#endif /* CONFIG_MD_FR1 */
 }
 
 static int raid1_spare_active(mddev_t *mddev)
@@ -741,9 +1078,42 @@
 	int mirror;
 	mirror_info_t *p;
 
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	for (mirror=0; mirror < mddev->raid_disks; mirror++) {
+#ifdef CONFIG_MD_FR1
+		/*
+		 * allow a disk which has only been set faulty but not
+		 * removed yet to be reinserted, thus triggering a hot
+		 * repair.
+		 */
+		p = &conf->mirrors[mirror];
+		if (unlikely(!p->rdev))
+			goto insert_or_replace;
+		printk(KERN_DEBUG "raid1: testing p->rdev %p\n", p->rdev);
+		if (unlikely(p->rdev == rdev))
+			goto insert_or_replace;
+		printk(KERN_DEBUG "raid1: testing p->rdev->bdev %p\n",
+			p->rdev->bdev);
+		if (!p->rdev->bdev)
+			goto insert_or_replace; // weird!
+		printk(KERN_DEBUG "raid1: testing p->rdev->bdev->bd_dev %x\n",
+			p->rdev->bdev->bd_dev);
+		if (p->rdev->bdev->bd_dev == rdev->bdev->bd_dev)
+			goto insert_or_replace;
+		continue;
+
+insert_or_replace:
+		if (1) {
+			if (p->rdev && p->rdev != rdev) {
+				/* kill the rdev left by export_rdev() */
+				printk(KERN_INFO
+					"raid1: late free of exported rdev %p\n",
+					p->rdev);
+				kfree(p->rdev);
+			}
+			p->rdev = rdev;
+#else
 		if ( !(p=conf->mirrors+mirror)->rdev) {
-
+#endif /* CONFIG_MD_FR1 */
 			blk_queue_stack_limits(mddev->queue,
 					       rdev->bdev->bd_disk->queue);
 			/* as we don't honour merge_bvec_fn, we must never risk
@@ -760,7 +1130,7 @@
 			p->rdev = rdev;
 			break;
 		}
-
+	}
 	print_conf(conf);
 	return found;
 }
@@ -844,6 +1214,14 @@
 	update_head_pos(mirror, r1_bio);
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
+#ifdef CONFIG_MD_FR1
+		/* clean the bitmap after resync */
+		struct bitmap * bitmap = conf->bitmap;
+		if (bitmap && bitmap->active(bitmap)) {
+			bitmap->clearbits(bitmap, r1_bio->sector >> 1,
+				r1_bio->sectors >> 1);
+		}
+#endif /* CONFIG_MD_FR1 */
 		md_done_sync(mddev, r1_bio->sectors, uptodate);
 		put_buf(r1_bio);
 	}
@@ -934,7 +1312,10 @@
 		} else {
 			int disk;
 			bio = r1_bio->bios[r1_bio->read_disk];
-			if ((disk=read_balance(conf, r1_bio)) == -1) {
+#ifdef CONFIG_MD_RAID1_ROBUST_READ
+			rdev = conf->mirrors[r1_bio->read_disk].rdev;
+#endif /* CONFIG_MD_RAID1_ROBUST_READ */
+			if ((disk=map(mddev, &rdev)) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
 				       " read error for block %llu\n",
 				       bdevname(bio->bi_bdev,b),
@@ -1004,6 +1385,22 @@
 	int i;
 	int write_targets = 0;
 
+#ifdef CONFIG_MD_FR1
+	/*
+	 * Will need to count mirror components currently with a bitmap
+	 * which have been marked faulty and nonoperational at some
+	 * point beforehand, and have been accumulating marks on the
+	 * bitmap to indicate dirty blocks that need syncing.
+	 */
+	struct bitmap * bitmap = conf->bitmap;
+	int count, block_not_dirty;
+	int targets[MD_SB_DISKS];
+	/*
+	 * discount the skipped sectors back to the md.c code
+	 */
+	extern atomic_t md_throttle[];
+#endif /* CONFIG_MD_FR1 */
+
 	if (!conf->r1buf_pool)
 		if (init_resync(conf))
 			return -ENOMEM;
@@ -1014,6 +1411,25 @@
 		return 0;
 	}
 
+#ifdef CONFIG_MD_FR1
+	/* also remove bitmap if not indicated */
+	if (!sector_nr) {
+		if (! test_and_clear_bit(MD_BITMAP_REPAIR, &mddev->recovery)) {
+			/* has to be outside spinlock as it takes it */
+			printk(KERN_WARNING
+				"%s: no repair bit on sb so removed bitmap %p\n",
+				mdname(mddev), bitmap);
+			if (bitmap)
+				bitmap->stop (bitmap);
+		} else {
+			printk(KERN_WARNING
+				"%s: repair bit set on sb so retained bitmap %p\n",
+				mdname(mddev), bitmap);
+		}
+		/* reset the bitmap indicator always */
+	}
+#endif /* CONFIG_MD_FR1 */
+
 	/*
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
@@ -1029,6 +1445,34 @@
 	 */
 	disk = conf->last_used;
 	/* make sure disk is operational */
+#ifdef CONFIG_MD_FR1
+	/* setup extra report counters for skipped/synced blocks */
+	if (!sector_nr) {
+		conf->sync_mode = -1;
+		conf->last_clean_sector = -1;
+		conf->last_dirty_sector = -1;
+	}
+
+	nr_sectors = RESYNC_SECTORS;
+	if (max_sector - sector_nr < nr_sectors)
+		nr_sectors = max_sector - sector_nr;
+
+	/* collect the mirrors that still need syncing (no lock is taken here) */
+	count = 0;
+	while(1) {
+		if (disk <= 0)
+			disk = conf->raid_disks;
+		disk--;
+		if (conf->mirrors[disk].rdev
+		    && !conf->mirrors[disk].rdev->faulty
+		    && (!conf->mirrors[disk].rdev->in_sync ||
+			sector_nr + nr_sectors > mddev->recovery_cp)) {
+			targets[count++] = disk;
+		}
+		if (disk == conf->last_used)
+			break;
+	}
+#endif /* CONFIG_MD_FR1 */
 
 	while (conf->mirrors[disk].rdev == NULL ||
 	       !conf->mirrors[disk].rdev->in_sync) {
@@ -1043,6 +1487,57 @@
 
 
 	mirror = conf->mirrors + disk;
+#ifdef CONFIG_MD_FR1
+	/*
+	 * check if bitmap says resync block can be skipped, and do so
+	 */
+	block_not_dirty = bitmap && bitmap->active(bitmap)
+		&& !bitmap->testbits(bitmap, sector_nr >> 1, nr_sectors >> 1);
+
+	if (count > 0 && block_not_dirty) {
+		/* skip */
+
+		md_sync_acct(mirror->rdev->bdev, nr_sectors);
+		/* NOTE(review): no nr_pending inc on targets[] is visible; verify the decs below */
+		md_done_sync(mddev, nr_sectors, 1);
+
+		for (i = 0; i < count; i++) {
+			int mirror = targets[i];
+			atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
+		}
+
+		/* do these conf accesses under lock, though only accounting */
+		spin_lock_irq(&conf->resync_lock);
+		if (conf->sync_mode != 0) {
+			if (conf->sync_mode == 1) {
+				printk(KERN_INFO "raid1: synced dirty sectors %lu-%lu\n",
+					conf->last_clean_sector+1,
+					conf->last_dirty_sector);
+			}
+			conf->sync_mode = 0;
+		}
+		conf->last_clean_sector = sector_nr + nr_sectors - 1;
+		if (sector_nr + nr_sectors >= mddev->size << 1) {
+			printk(KERN_INFO "raid1: skipped clean sectors %lu-%lu\n",
+				conf->last_dirty_sector+1,
+				conf->last_clean_sector);
+		}
+		/* update md driver throttle discount */
+		atomic_add(nr_sectors, &md_throttle[mddev->md_minor]);
+
+		/*
+		 * Wake up any possible resync thread that waits for the device
+		 * to go idle.
+		 */
+		--conf->barrier;
+		wake_up(&conf->wait_idle);
+		wake_up(&conf->wait_resume);
+		spin_unlock_irq(&conf->resync_lock);
+
+		md_wakeup_thread(mddev->thread);
+		return nr_sectors;
+	}
+#endif /* CONFIG_MD_FR1 */
 
 	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
 
@@ -1136,6 +1631,28 @@
 
 	generic_make_request(bio);
 
+#ifdef CONFIG_MD_FR1
+	/* printout info from time to time */
+	spin_lock_irq(&conf->resync_lock);
+	if (conf->sync_mode != 1) {
+		if (conf->sync_mode == 0) {
+			printk(KERN_INFO
+				"raid1: skipped clean sectors %lu-%lu\n",
+				conf->last_dirty_sector+1,
+				conf->last_clean_sector);
+		}
+		conf->sync_mode = 1;
+	}
+	conf->last_dirty_sector = sector_nr + nr_sectors - 1;
+
+	if (sector_nr + nr_sectors >= mddev->size << 1) {
+		printk(KERN_INFO "raid1: synced dirty sectors %lu-%lu\n",
+			conf->last_clean_sector+1,
+			conf->last_dirty_sector);
+	}
+	spin_unlock_irq(&conf->resync_lock);
+#endif /* CONFIG_MD_FR1 */
+
 	return nr_sectors;
 }
 
@@ -1247,6 +1764,17 @@
 	conf->last_used = j;
 
 
+#ifdef CONFIG_MD_FR1
+	/* make the bitmap now - hope mddev->size exists already */
+	if (create_bitmap(conf) < 0) {
+		printk(KERN_ERR "raid1: out of memory for bitmap on %s\n",
+			mdname(mddev));
+		goto out_free_conf;
+	}
+
+	/* PTB set it active too */
+	start_bitmap (conf);
+#endif /* CONFIG_MD_FR1 */
 
 	{
 		mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
@@ -1280,6 +1808,9 @@
 			kfree(conf->mirrors);
 		if (conf->poolinfo)
 			kfree(conf->poolinfo);
+#ifdef CONFIG_MD_FR1
+		remove_bitmap (conf);
+#endif /* CONFIG_MD_FR1 */
 		kfree(conf);
 		mddev->private = NULL;
 	}
@@ -1300,6 +1831,9 @@
 		kfree(conf->mirrors);
 	if (conf->poolinfo)
 		kfree(conf->poolinfo);
+#ifdef CONFIG_MD_FR1
+	remove_bitmap (conf);
+#endif /* CONFIG_MD_FR1 */
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;