--- linux-2.6.11.6/drivers/md/md.c.orig Sat Mar 26 04:28:17 2005 +++ linux-2.6.11.6/drivers/md/md.c Sun Jun 12 09:09:05 2005 @@ -27,6 +27,27 @@ You should have received a copy of the GNU General Public License (for example /usr/src/linux/COPYING); if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Changes 31/1/2003 by Peter T. Breuer to support + hotadd directly after setfaulty without intervening hotremove + ("hotrepair") when there is no persistent superblock, and to flag a + potential hotrepair when an old disk is re-added and the uuid matches + ours. The flag is used by the raid1 driver, at the moment, in order + to trigger an intelligent resync. + + Yet more changes by PTB 12/3/2003 to notify devices via ioctls when + they have been incorporated or removed from a raid array. + + Yet more changes by PTB 26/3/2004 to make the speed calculations + appropriate to fr1, and throttle by real i/o, not resync total. + + Changes by PTB 15/3/2005 to make sure the sb of the rdev is read + just before we check its uuid. + + Changes by PTB 15/3/2005 to keep rdev from being kfreed in + export_rdev() and instead free it in raid1 during + pers->add_disk. Otherwise we would trace along a freed struct there + to see if it represents a dev we are interested in repairing. */ #include <linux/module.h> @@ -55,6 +76,7 @@ #define DEBUG 0 #define dprintk(x...) 
((void)(DEBUG && printk(x))) +#define MD_BITMAP_SUPPORT 1 #ifndef MODULE static void autostart_arrays (int part); @@ -122,6 +144,14 @@ { .ctl_name = 0 } }; +#ifdef MD_BITMAP_SUPPORT +/* PTB md_throttle permits speed calculation adjustments from personality */ +#ifdef MODULE +static +#endif /* MODULE */ +atomic_t md_throttle[MAX_MD_DEVS]; +#endif /* MD_BITMAP_SUPPORT */ + static struct block_device_operations md_fops; /* @@ -173,7 +203,14 @@ return; if (!mddev->raid_disks && list_empty(&mddev->disks)) { list_del(&mddev->all_mddevs); +#ifdef MD_BITMAP_SUPPORT + spin_unlock(&all_mddevs_lock); + /* blk_put_queue calls kblockd_flush, which can sleep */ +#endif /* MD_BITMAP_SUPPORT */ blk_put_queue(mddev->queue); +#ifdef MD_BITMAP_SUPPORT + spin_lock(&all_mddevs_lock); +#endif /* MD_BITMAP_SUPPORT */ kfree(mddev); } spin_unlock(&all_mddevs_lock); @@ -355,6 +392,10 @@ static int read_disk_sb(mdk_rdev_t * rdev) { char b[BDEVNAME_SIZE]; +#ifdef MD_BITMAP_SUPPORT + mdp_super_t *sb; +#endif /* MD_BITMAP_SUPPORT */ + if (!rdev->sb_page) { MD_BUG(); return -EINVAL; @@ -366,6 +407,11 @@ if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; +#ifdef MD_BITMAP_SUPPORT + sb = (mdp_super_t *)page_address(rdev->sb_page); + printk(KERN_INFO "%s (read) [events: %08lx]\n", + bdevname(rdev->bdev,b), (unsigned long)sb->events_lo); +#endif /* MD_BITMAP_SUPPORT */ return 0; fail: @@ -582,6 +628,9 @@ mddev->raid_disks = sb->raid_disks; mddev->size = sb->size; mddev->events = md_event(sb); +#ifdef MD_BITMAP_SUPPORT + mddev->bitmap_events = MD_SB_BITMAP_EVENTS(sb); +#endif /* MD_BITMAP_SUPPORT */ if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; @@ -669,6 +718,10 @@ sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; +#ifdef MD_BITMAP_SUPPORT + MD_SB_BITMAP_EVENTS_HI(sb) = (u32)(mddev->bitmap_events >> 32); + MD_SB_BITMAP_EVENTS_LO(sb) = (u32)mddev->bitmap_events; +#endif /* MD_BITMAP_SUPPORT */ if 
(mddev->in_sync) { @@ -872,6 +925,11 @@ mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->size = le64_to_cpu(sb->size)/2; mddev->events = le64_to_cpu(sb->events); +#ifdef MD_BITMAP_SUPPORT + mddev->bitmap_events = + (((__u64)le32_to_cpu(MD_SB_BITMAP_EVENTS_HI_1(sb)))<<32) + | (__u64)le32_to_cpu(MD_SB_BITMAP_EVENTS_LO_1(sb)); +#endif /* MD_BITMAP_SUPPORT */ mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); @@ -928,6 +986,10 @@ sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); +#ifdef MD_BITMAP_SUPPORT + MD_SB_BITMAP_EVENTS_HI_1(sb) = (u32)cpu_to_le32(mddev->bitmap_events >> 32); + MD_SB_BITMAP_EVENTS_LO_1(sb) = (u32)cpu_to_le32(mddev->bitmap_events); +#endif /* MD_BITMAP_SUPPORT */ if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); else @@ -998,6 +1060,91 @@ return 0; } +#ifdef MD_BITMAP_SUPPORT +static mdk_rdev_t * find_rdev(mddev_t *mddev, dev_t dev); +static mdk_rdev_t * +find_rdev_all (dev_t dev) +{ /* Returns the first rdev that find_rdev() reports for dev in any array, else NULL. NOTE(review): this returns from inside ITERATE_MDDEV - confirm nothing the iterator holds needs releasing on early exit. */ + struct list_head *tmp; + mddev_t *mddev; + + ITERATE_MDDEV(mddev, tmp) { + mdk_rdev_t *rdev = find_rdev(mddev, dev); + if (rdev) + return rdev; + } + return NULL; +} + +static int hot_add_disk(mddev_t * mddev, dev_t dev); +static int set_disk_faulty(mddev_t *mddev, dev_t dev); + +/* + * Callback whose address notify_device() passes to component devices. Applies cmd (HOT_ADD_DISK or SET_DISK_FAULTY) to the array + * that holds dev and returns that call's result; returns -EINVAL if dev is bound to no array or cmd is anything else. + */ +static int +md_hot_cmd_disk(dev_t dev, int cmd) { + mdk_rdev_t *rdev; + mddev_t *mddev; + int res; + + rdev = find_rdev_all(dev); + if (!rdev) + return -EINVAL; + mddev = rdev->mddev; + if (!mddev) + return -EINVAL; + + switch(cmd) { + case HOT_ADD_DISK: + res = hot_add_disk(mddev, dev); + return res; + case SET_DISK_FAULTY: + res = set_disk_faulty(mddev, dev); + return res; + } + return -EINVAL; +} + +static void +notify_device (mddev_t * mddev, dev_t dev) +{ /* Tell component dev, via the BLKMDNTFY ioctl, which md array it is now in, then pass it md_hot_cmd_disk via BLKMDRGTR; ioctl results are ignored. */ +#ifndef BLKMDNTFY +#define BLKMDNTFY _IOW(0x12,133,int) +#endif /* BLKMDNTFY */ + struct block_device *bdev; + + bdev = bdget (dev); + if (!bdev) + return; + printk (KERN_INFO 
"%s: notifying dev %x it is now in array\n", + mdname(mddev), dev); + ioctl_by_bdev (bdev, BLKMDNTFY, MKDEV (MD_MAJOR, mddev->md_minor)); +#ifndef BLKMDRGTR +#define BLKMDRGTR _IOW(0x12,135,unsigned long) +#endif /* BLKMDRGTR */ + ioctl_by_bdev (bdev, BLKMDRGTR, (unsigned long)md_hot_cmd_disk); /* the ioctl argument is the address of our hot-command callback */ + bdput(bdev); +} +static void +unnotify_device (mddev_t * mddev, dev_t dev) +{ /* Tell component dev, via the BLKMDUNTFY ioctl, that it is no longer in array mddev; the ioctl result is ignored. */ +#ifndef BLKMDUNTFY +#define BLKMDUNTFY _IOW(0x12,134,int) +#endif /* BLKMDUNTFY */ + struct block_device *bdev; + + bdev = bdget (dev); + if (!bdev) + return; + printk (KERN_INFO "%s: notifying dev %x it is no longer in array\n", + mdname(mddev), dev); + ioctl_by_bdev(bdev, BLKMDUNTFY, MKDEV(MD_MAJOR, mddev->md_minor)); + bdput(bdev); +} +#endif /* MD_BITMAP_SUPPORT */ + static LIST_HEAD(pending_raid_disks); static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) @@ -1036,6 +1183,9 @@ list_add(&rdev->same_set, &mddev->disks); rdev->mddev = mddev; printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); +#ifdef MD_BITMAP_SUPPORT + notify_device(mddev, rdev->bdev->bd_inode->i_rdev); +#endif /* MD_BITMAP_SUPPORT */ return 0; } @@ -1046,6 +1196,9 @@ MD_BUG(); return; } +#ifdef MD_BITMAP_SUPPORT + unnotify_device(rdev->mddev, rdev->bdev->bd_inode->i_rdev); +#endif /* MD_BITMAP_SUPPORT */ list_del_init(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; @@ -1075,6 +1228,10 @@ blkdev_put(bdev); return err; } +#ifdef MD_BITMAP_SUPPORT + printk(KERN_DEBUG "md %d: %s restores bdev %x rdev(%p)->bdev(%p)\n", + __LINE__, __FUNCTION__, dev, rdev, bdev); +#endif /* MD_BITMAP_SUPPORT */ rdev->bdev = bdev; return err; } @@ -1082,6 +1239,9 @@ static void unlock_rdev(mdk_rdev_t *rdev) { struct block_device *bdev = rdev->bdev; +#ifdef MD_BITMAP_SUPPORT + printk(KERN_DEBUG "md %d: %s nulls bdev\n", __LINE__, __FUNCTION__); +#endif /* MD_BITMAP_SUPPORT */ rdev->bdev = NULL; if (!bdev) MD_BUG(); @@ -1104,6 +1264,16 @@ md_autodetect_dev(rdev->bdev->bd_dev); #endif unlock_rdev(rdev); +#ifdef 
MD_BITMAP_SUPPORT +#ifdef CONFIG_MD_FR1 + /* FIXME only kfree if pers hot_add does not kfree instead. + * Use some flag in future! + */ + printk(KERN_WARNING + "md: WARNING: delaying free of exported rdev %p\n", rdev); + if (0) +#endif /* CONFIG_MD_FR1 */ +#endif /* MD_BITMAP_SUPPORT */ kfree(rdev); } @@ -2181,6 +2351,10 @@ int err; unsigned int size; mdk_rdev_t *rdev; +#ifdef MD_BITMAP_SUPPORT + int hotrepair = 0; + mdp_super_t *sb; +#endif /* MD_BITMAP_SUPPORT */ if (!mddev->pers) return -ENODEV; @@ -2198,7 +2372,42 @@ return -EINVAL; } +#ifdef MD_BITMAP_SUPPORT + /* + * Done at most once (the code below is a plain if, not a loop): if dev + * is still bound to this array it must be marked faulty, and it is + * hot-removed before being imported again. NOTE(review): hotrepair is never set on this path, so the later "!mddev->persistent && hotrepair" test looks unreachable - confirm. + */ + rdev = find_rdev(mddev, dev); + if (rdev) { + int mirror; + + /* found it in array, so it's not yet been removed */ + if (rdev->bdev->bd_inode->i_rdev != dev + || !rdev->faulty) { + printk(KERN_WARNING "%s: cannot add existing component %x\n", + mdname(mddev), dev); + return -EBUSY; + } + /* + * Allow "hotrepair" of faulty device. 
Have rdev->faulty; + */ + printk(KERN_WARNING "%s: repair of faulty disk %x!\n", + mdname(mddev), dev); + + mirror = rdev->raid_disk; + rdev->raid_disk = -1; + err = hot_remove_disk(mddev, dev); + if (err < 0) { + printk(KERN_WARNING "%s: remove disk %x errored\n", + mdname(mddev), dev); + return err; + } + } + rdev = md_import_device (dev, 0, 0); // PTB -1 == don't check sb +#else rdev = md_import_device (dev, -1, 0); +#endif /* MD_BITMAP_SUPPORT */ if (IS_ERR(rdev)) { printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", @@ -2231,6 +2440,71 @@ err = -EINVAL; goto abort_export; } + +#ifdef MD_BITMAP_SUPPORT + printk(KERN_INFO "md: old uuid %x %x %x %x\n", + *(__u32 *)(mddev->uuid+0), + *(__u32 *)(mddev->uuid+4), + *(__u32 *)(mddev->uuid+8), + *(__u32 *)(mddev->uuid+12)); + /* get the component's superblock */ + if (!rdev->sb_loaded) + read_disk_sb(rdev); + sb = (mdp_super_t *)page_address(rdev->sb_page); + if (sb) { + printk(KERN_INFO "md: new uuid %x %x %x %x\n", + sb->set_uuid0, + sb->set_uuid1, + sb->set_uuid2, + sb->set_uuid3); + } else { + printk(KERN_INFO "md: new component device has no sb\n"); + } + /* let's check the new disk sb at this point */ + if (mddev->persistent && sb + && sb->set_uuid0 == *(__u32 *)(mddev->uuid+0) + && sb->set_uuid1 == *(__u32 *)(mddev->uuid+4) + && sb->set_uuid2 == *(__u32 *)(mddev->uuid+8) + && sb->set_uuid3 == *(__u32 *)(mddev->uuid+12)) { + long long disk_events, bitmap_events; + disk_events = sb->events_hi; + disk_events <<= 32; + disk_events |= sb->events_lo; + bitmap_events = mddev->bitmap_events; + + /* This is where we should examine conf->events_chkpt_* + */ + if (disk_events < bitmap_events + && disk_events >= bitmap_events - 2 + ) { + printk(KERN_WARNING + "%s: warning - new disk %x nearly too old for repair (disk %Ld < bitmap %Ld)\n", + mdname(mddev), dev, disk_events, bitmap_events); + } + if (disk_events < bitmap_events - 2) { + /* new disk is too old! 
*/ + printk(KERN_INFO + "%s: new disk %x too old for repair (disk %Ld < bitmap %Ld)\n", + mdname(mddev), dev, disk_events, bitmap_events); + hotrepair = 0; + } else { + printk(KERN_INFO + "%s: repairing old mirror component %x (disk %Ld >= bitmap %Ld)\n", + mdname(mddev), dev, disk_events, bitmap_events); + hotrepair = 1; + } + } else if (!mddev->persistent && hotrepair) { + printk(KERN_INFO + "md: forced repair of mirror component %x\n", + dev); + hotrepair = 1; + } else { + /* failed match */ + printk(KERN_INFO "md: adding new mirror component %x\n", dev); + hotrepair = 0; + } +#endif /* MD_BITMAP_SUPPORT */ + rdev->in_sync = 0; rdev->desc_nr = -1; bind_rdev_to_array(rdev, mddev); @@ -2247,6 +2521,21 @@ goto abort_unbind_export; } +#ifdef MD_BITMAP_SUPPORT + /* + * Record the repair decision in mddev->recovery: MD_BITMAP_REPAIR set + * (1) means we want the raid1 resync to respect the bitmap if there is + * one; clear (0) means any bitmap we have been saving must be killed, + * but that is done in the raid1 resync instead of here. + */ + printk(KERN_DEBUG "%s: set repair bit to %d on superblock %p\n", + mdname(mddev), hotrepair, mddev); + if (hotrepair) + set_bit (MD_BITMAP_REPAIR, &mddev->recovery); + else + clear_bit(MD_BITMAP_REPAIR, &mddev->recovery); +#endif /* MD_BITMAP_SUPPORT */ + rdev->raid_disk = -1; md_update_sb(mddev); @@ -3260,6 +3549,10 @@ mddev_t *mddev2; unsigned int currspeed = 0, window; +#ifdef MD_BITMAP_SUPPORT + /* PTB add realspeed for i/o limiting calculation */ + unsigned realspeed; +#endif /* MD_BITMAP_SUPPORT */ sector_t max_sectors,j; unsigned long mark[SYNC_MARKS]; sector_t mark_cnt[SYNC_MARKS]; @@ -3338,7 +3631,7 @@ /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; - printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); + printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" " %d KB/sec/disc.\n", sysctl_speed_limit_min); printk(KERN_INFO "md: using maximum 
available idle IO bandwith " @@ -3368,6 +3661,9 @@ atomic_set(&mddev->recovery_active, 0); init_waitqueue_head(&mddev->recovery_wait); last_check = 0; +#ifdef MD_BITMAP_SUPPORT + atomic_set(&md_throttle[mddev->md_minor], 0); +#endif /* MD_BITMAP_SUPPORT */ if (j>2) { printk(KERN_INFO @@ -3404,6 +3700,10 @@ mddev->resync_mark = mark[next]; mddev->resync_mark_cnt = mark_cnt[next]; +#ifdef MD_BITMAP_SUPPORT + /* PTB reset count of skipped blocks this mark */ + atomic_set(&md_throttle[mddev->md_minor], 0); +#endif /* MD_BITMAP_SUPPORT */ mark[next] = jiffies; mark_cnt[next] = j - atomic_read(&mddev->recovery_active); last_mark = next; @@ -3433,10 +3733,23 @@ cond_resched(); currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; +#ifdef MD_BITMAP_SUPPORT + /* + * some of the blocks are skipped, not synced, so + * should not count when limiting i/o. Let personality say. + */ + realspeed = ((unsigned long)(j - mddev->resync_mark_cnt - atomic_read(&md_throttle[mddev->md_minor])))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; +#endif /* MD_BITMAP_SUPPORT */ if (currspeed > sysctl_speed_limit_min) { +#ifdef MD_BITMAP_SUPPORT + /* PTB use realspeed for upper limit on i/o */ + if ((realspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { +#else if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { +#endif /* MD_BITMAP_SUPPORT */ msleep_interruptible(250); goto repeat; } @@ -3534,15 +3847,35 @@ mddev->pers->spare_active(mddev); } md_update_sb(mddev); +#ifdef MD_BITMAP_SUPPORT + mddev->recovery &= 1 << MD_BITMAP_REPAIR; + + printk(KERN_ERR + "%s: md_check_recovery 1:" + " repair bit %lx on sb %p preserved\n", + mdname(mddev), mddev->recovery, mddev); +#else mddev->recovery = 0; +#endif /* MD_BITMAP_SUPPORT */ /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } if (mddev->recovery) /* probably just the RECOVERY_NEEDED flag */ +#ifdef MD_BITMAP_SUPPORT + { + 
mddev->recovery &= 1 << MD_BITMAP_REPAIR; + + printk(KERN_ERR + "%s: md_check_recovery 2:" + " repair bit %lx on sb %p preserved\n", + mdname(mddev), mddev->recovery, mddev); + } +#else mddev->recovery = 0; - + /* flag recovery needed just to double check */ +#endif /* MD_BITMAP_SUPPORT */ /* no recovery is running. * remove any failed drives, then * add spares if possible @@ -3584,7 +3917,16 @@ " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ +#ifdef MD_BITMAP_SUPPORT + mddev->recovery &= 1 << MD_BITMAP_REPAIR; + + printk(KERN_ERR + "%s: md_check_recovery 3:" + " repair bit %lx on sb %p preserved\n", + mdname(mddev), mddev->recovery, mddev); +#else mddev->recovery = 0; +#endif /* MD_BITMAP_SUPPORT */ } else { md_wakeup_thread(mddev->sync_thread); } @@ -3764,4 +4106,7 @@ EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_print_devices); EXPORT_SYMBOL(md_check_recovery); +#ifdef MD_BITMAP_SUPPORT +EXPORT_SYMBOL(md_throttle); +#endif /* MD_BITMAP_SUPPORT */ MODULE_LICENSE("GPL");