--- linux-2.4.30/include/linux/raid/raid1.h.pre-fr1 Sun Aug 12 21:39:02 2001 +++ linux-2.4.30/include/linux/raid/raid1.h Wed Apr 6 19:17:25 2005 @@ -59,6 +59,15 @@ md_wait_queue_head_t wait_done; md_wait_queue_head_t wait_ready; md_spinlock_t segment_lock; + + long last_clean_sector; /* helps debugging */ + long last_dirty_sector; + int sync_mode; /* clean/dirty in sync? */ + void *bitmap; /* the array bitmap */ + int bitmap_dirty; /* flag */ + int latency[MD_SB_DISKS]; + int last_source; /* PTB disk read from */ + }; typedef struct raid1_private_data raid1_conf_t; @@ -86,9 +95,19 @@ struct buffer_head *mirror_bh_list; struct buffer_head bh_req; struct raid1_bh *next_r1; /* next for retry or in free list */ + int nonoperational; /* no of bad mirror comps */ + unsigned long start_jiffies; /* PTB when i/o started */ }; /* bits for raid1_bh.state */ #define R1BH_Uptodate 1 #define R1BH_SyncPhase 2 #define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */ +#define R1BH_AsyncPhase 4 +#define R1BH_AsyncIO 5 +#ifdef CONFIG_MD_RAID1_ROBUST_READ +#define R1BH_ReadRetry 6 +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ +#ifdef CONFIG_MD_RAID1_READ_WRITE_CORRECT +#define R1BH_ReadRewrite 7 +#endif /* CONFIG_MD_RAID1_READ_WRITE_CORRECT */ #endif --- linux-2.4.30/include/linux/raid/md_p.h.pre-fr1 Tue Nov 14 22:16:37 2000 +++ linux-2.4.30/include/linux/raid/md_p.h Wed Apr 6 18:18:04 2005 @@ -66,7 +66,7 @@ #define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) #define MD_SB_PERSONALITY_WORDS 64 #define MD_SB_DESCRIPTOR_WORDS 32 -#define MD_SB_DISKS 27 +#define MD_SB_DISKS 26 #define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) #define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) #define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) @@ -154,6 +154,9 @@ /* * Reserved */ +#define MD_SB_BITMAP_REPAIR(sb) (sb)->reserved[0] 
+#define MD_SB_EVENTS_LO(sb) (sb)->reserved[2] +#define MD_SB_EVENTS_HI(sb) (sb)->reserved[3] __u32 reserved[MD_SB_RESERVED_WORDS]; /* --- linux-2.4.30/drivers/md/md.c.pre-fr1 Mon Aug 25 13:44:42 2003 +++ linux-2.4.30/drivers/md/md.c Wed Apr 6 22:41:29 2005 @@ -26,6 +26,19 @@ You should have received a copy of the GNU General Public License (for example /usr/src/linux/COPYING); if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Changes 31/1/2003 by Peter T. Breuer to support + hotadd directly after setfaulty without intervening hotremove + ("hotrepair") when there is no persistent superblock, and to flag a + potential hotrepair when an old disk is re-added and the uuid matches + ours. The flag is used by the raid1 driver, at the moment, in order + to trigger an intelligent resync. + + Yet more changes by PTB 12/3/2003 to notify devices via ioctls when + they have been incorporated or removed from a raid array. + + Yet more changes by PTB 26/3/2004 to make the speed calculations + appropriate to fr1, and throttle by real i/o, not resync total. 
*/ #include @@ -108,6 +121,13 @@ static int md_hardsect_sizes[MAX_MD_DEVS]; static int md_maxreadahead[MAX_MD_DEVS]; static mdk_thread_t *md_recovery_thread; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) +/* PTB md_throttle permits speed calculation adjustments from personality */ +#ifdef MODULE +static +#endif /* MODULE */ +atomic_t md_throttle[MAX_MD_DEVS]; +#endif /* MD_BITMAP_SUPPORT */ int md_size[MAX_MD_DEVS]; @@ -524,7 +544,8 @@ printk(NO_SB,partition_name(dev)); return -EINVAL; } - printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + printk(KERN_INFO "%s (read) [events: %08lx]\n", + partition_name(rdev->dev), (unsigned long)rdev->sb->events_lo); ret = 0; abort: return ret; @@ -611,6 +632,68 @@ return 0; } +static int +md_hot_add_disk(kdev_t dev, int cmd) { + + static mdk_rdev_t * find_rdev_all(kdev_t dev); + static int hot_add_disk(mddev_t * mddev, kdev_t dev); + static int set_disk_faulty(mddev_t *mddev, kdev_t dev); + + mdk_rdev_t *rdev; + mddev_t *mddev; + int res; + + rdev = find_rdev_all(dev); + if (!rdev) + return -EINVAL; + mddev = rdev->mddev; + if (!mddev) + return -EINVAL; + + switch(cmd) { + case HOT_ADD_DISK: + res = hot_add_disk(mddev, dev); + return res; + case SET_DISK_FAULTY: + res = set_disk_faulty(mddev, dev); + return res; + } + return -EINVAL; +} + +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) +static void +notify_device (mddev_t * mddev, kdev_t dev) +{ +#ifndef BLKMDNTFY +#define BLKMDNTFY _IOW(0x12,133,int) +#endif + struct block_device *bdev; + printk (KERN_INFO "md%d: notifying dev %x\n", mdidx(mddev), dev); + bdev = bdget (dev); + if (!bdev) + return; + ioctl_by_bdev (bdev, BLKMDNTFY, MKDEV (MD_MAJOR, mddev->__minor)); +#ifndef BLKMDRGTR +#define BLKMDRGTR _IOW(0x12,135,unsigned long) +#endif + ioctl_by_bdev (bdev, BLKMDRGTR, (unsigned long)md_hot_add_disk); +} +static void +unnotify_device (mddev_t * mddev, kdev_t dev) +{ +#ifndef BLKMDUNTFY +#define BLKMDUNTFY 
_IOW(0x12,134,int) +#endif + struct block_device *bdev; + printk (KERN_INFO "md%d: unnotifying dev %x\n", mdidx(mddev), dev); + bdev = bdget (dev); + if (!bdev) + return; + ioctl_by_bdev(bdev, BLKMDUNTFY, MKDEV(MD_MAJOR, mddev->__minor)); +} +#endif /* MD_BITMAP_SUPPORT */ + static MD_LIST_HEAD(all_raid_disks); static MD_LIST_HEAD(pending_raid_disks); @@ -634,6 +717,9 @@ rdev->mddev = mddev; mddev->nb_dev++; printk(KERN_INFO "md: bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + notify_device(mddev, rdev->dev); +#endif /* MD_BITMAP_SUPPORT */ } static void unbind_rdev_from_array(mdk_rdev_t * rdev) @@ -642,6 +728,9 @@ MD_BUG(); return; } +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + unnotify_device(rdev->mddev, rdev->dev); +#endif /* MD_BITMAP_SUPPORT */ md_list_del(&rdev->same_set); MD_INIT_LIST_HEAD(&rdev->same_set); rdev->mddev->nb_dev--; @@ -2383,6 +2472,9 @@ unsigned int size; mdk_rdev_t *rdev; mdp_disk_t *disk; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + int hotrepair = 0; +#endif /* MD_BITMAP_SUPPORT */ if (!mddev->pers) return -ENODEV; @@ -2398,11 +2490,48 @@ persistent = !mddev->sb->not_persistent; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + /* + * This is a do at most once loop because the remove in the loop will + * cause the test to fail the next time round. And if that + * doesn't break us out, then the hotrepair count will. + */ + while ((rdev = find_rdev(mddev, dev)) != NULL) { + + if (hotrepair || rdev->dev != dev || !rdev->faulty) { + printk(KERN_WARNING "md%d: cannot add existing component %x\n", + mdidx(mddev), dev); + return -EBUSY; + } + /* + * Allow "hotrepair" of merely faulty device too if no superblock to + * go by or (later) if there is a matching superblock. We assume then + * that hotadd after setfaulty of the same device is a + * hotrepair. 
+ */ + printk(KERN_WARNING "md%d: repair of faulty disk %x!\n", + mdidx(mddev), dev); + + /* Remove will cause find_rdev to fail next time */ + err = hot_remove_disk(mddev, dev); + if (err < 0) { + printk(KERN_WARNING "md%d: remove disk %x errored\n", + mdidx(mddev), dev); + return err; + } + /* This will inevitably error us out of the loop interior next time */ + hotrepair = 1; + rdev = NULL; + } + + err = md_import_device (dev, persistent); +#else rdev = find_rdev(mddev, dev); if (rdev) return -EBUSY; err = md_import_device (dev, 0); +#endif /* MD_BITMAP_SUPPORT */ if (err) { printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); return -EINVAL; @@ -2426,6 +2554,58 @@ err = -ENOSPC; goto abort_export; } +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + /* let's check the new disk sb at this point */ + if (persistent && rdev->sb + && rdev->sb->set_uuid0 == mddev->sb->set_uuid0 + && rdev->sb->set_uuid1 == mddev->sb->set_uuid1 + && rdev->sb->set_uuid2 == mddev->sb->set_uuid2 + && rdev->sb->set_uuid3 == mddev->sb->set_uuid3) { + unsigned long long disk_events, bitmap_events; + disk_events = rdev->sb->events_lo; + bitmap_events = 0; + //bitmap_events |= MD_SB_EVENTS_HI(mddev->sb); + //bitmap_events <<= 32; + bitmap_events |= MD_SB_EVENTS_LO(mddev->sb); + + /* This is where we should examine conf->events_chkpt_* + */ + if (disk_events == bitmap_events - 1) { + printk(KERN_WARNING "md%d: warning - new disk %x nearly too old for repair (disk %Lu < bitmap %Lu)\n", + mdidx(mddev), dev, disk_events, bitmap_events); + } + if (disk_events < bitmap_events - 1) { + /* new disk is too old! 
*/ + hotrepair = 0; + printk(KERN_INFO "md%d: new disk %x too old for repair (disk %Lu < bitmap %Lu)\n", + mdidx(mddev), dev, disk_events, bitmap_events); + } else { + hotrepair = 1; + printk(KERN_INFO "md%d: repairing old mirror component %x (disk %Lu >= bitmap %Lu)\n", + mdidx(mddev), dev, disk_events, bitmap_events); + } + } else if (!persistent && hotrepair) { + hotrepair = 1; + printk(KERN_INFO "md: forced repair of mirror component %x\n", + dev); + } else { + /* failed match */ + hotrepair = 0; + printk(KERN_INFO "md: adding new mirror component %x\n", + dev); + printk(KERN_DEBUG "md: old uuid %x %x %x %x\n", + mddev->sb->set_uuid0, + mddev->sb->set_uuid1, + mddev->sb->set_uuid2, + mddev->sb->set_uuid3); + printk(KERN_DEBUG "md: new uuid %x %x %x %x\n", + rdev->sb->set_uuid0, + rdev->sb->set_uuid1, + rdev->sb->set_uuid2, + rdev->sb->set_uuid3); + } +#endif /* MD_BITMAP_SUPPORT */ + bind_rdev_to_array(rdev, mddev); /* @@ -2480,6 +2660,17 @@ mddev->sb->spare_disks++; mddev->sb->working_disks++; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + /* + * Maybe say something nice - 1 means we want to respect + * the bitmap in raid1 resync if there is one, 0 + * means we need to kill any bitmap that we have been + * saving but we'll do it in the raid1 resync instead of here + */ + printk(KERN_DEBUG "md%d: set repair bit to %d on superblock\n", + mdidx(mddev), hotrepair); + MD_SB_BITMAP_REPAIR(mddev->sb) = hotrepair; +#endif /* MD_BITMAP_SUPPORT */ mddev->sb_dirty = 1; md_update_sb(mddev); @@ -3419,6 +3610,10 @@ mddev_t *mddev2; unsigned int max_sectors, currspeed, j, window, err, serialize; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + /* PTB add realspeed for i/o limiting calculation */ + unsigned realspeed; +#endif /* MD_BITMAP_SUPPORT */ unsigned long mark[SYNC_MARKS]; unsigned long mark_cnt[SYNC_MARKS]; int last_mark,m; @@ -3488,6 +3683,9 @@ atomic_set(&mddev->recovery_active, 0); 
init_waitqueue_head(&mddev->recovery_wait); last_check = 0; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + atomic_set(&md_throttle[mdidx(mddev)], 0); +#endif /* MD_BITMAP_SUPPORT */ for (j = 0; j < max_sectors;) { int sectors; @@ -3515,6 +3713,10 @@ mddev->resync_mark = mark[next]; mddev->resync_mark_cnt = mark_cnt[next]; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + /* PTB reset count of skipped blocks this mark */ + atomic_set(&md_throttle[mdidx(mddev)], 0); +#endif /* MD_BITMAP_SUPPORT */ mark[next] = jiffies; mark_cnt[next] = j - atomic_read(&mddev->recovery_active); last_mark = next; @@ -3540,16 +3742,34 @@ * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - if (md_need_resched(current)) - schedule(); + if (md_need_resched(current)) { + /* PTB this seems not to progress when over loop dev */ + + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(1); + } currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + /* + * PTB some of the blocks are skipped, not synced, so + * should not count when limiting i/o. Let personality say. 
+ */ + realspeed = (j - mddev->resync_mark_cnt - atomic_read(&md_throttle[mdidx(mddev)]))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; +#endif /* MD_BITMAP_SUPPORT */ + if (currspeed > sysctl_speed_limit_min) { current->nice = 19; - if ((currspeed > sysctl_speed_limit_max) || - !is_mddev_idle(mddev)) { + if ( +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) + /* PTB use realspeed for upper limit on i/o */ + (realspeed > sysctl_speed_limit_max) || +#else + (currspeed > sysctl_speed_limit_max) || +#endif /* MD_BITMAP_SUPPORT */ + !is_mddev_idle(mddev)) { current->state = TASK_INTERRUPTIBLE; md_schedule_timeout(HZ/4); goto repeat; @@ -4108,6 +4328,9 @@ } #endif +#if defined(CONFIG_MD_BITMAP) || defined(CONFIG_MD_BITMAP_MODULE) +MD_EXPORT_SYMBOL(md_throttle); +#endif /* MD_BITMAP_SUPPORT */ MD_EXPORT_SYMBOL(md_size); MD_EXPORT_SYMBOL(register_md_personality); MD_EXPORT_SYMBOL(unregister_md_personality); --- linux-2.4.30/drivers/md/bitmap.c.pre-fr1 Wed Apr 6 18:18:04 2005 +++ linux-2.4.30/drivers/md/bitmap.c Wed Apr 6 18:18:04 2005 @@ -0,0 +1,880 @@ +/* + * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * bitmap_init - sets nr blks + * bitmap->start - then calls the setup part for the 1st + * level in the bitmap, which uses memory (kmalloc) so + * can fail. You should examine the return value. 0 is + * OK. -ve is FAIL. + * + * bitmap->stop - inverse to bitmap->start. kfrees the memory claimed in + * bitmap_init. + */ + +#include +#include +#include +#include +#include // PTB for kmalloc! How? +#include // PTB for vmalloc +#include +#include +#include + +# define DEBUG 1 + +#include "bitmap.h" + +/* use 16 bits of the address as extra bitmap */ +#define ZONESHIFT 4 +/* top 16 bits are nonzero */ +#define IS_ADDRESS(x) \ + ((( ((unsigned long)(x)) >> ((sizeof(char*)<<3) - (1< 0 +# define PRINTK(x...) printk(x) +# else +# define PRINTK(x...) 
+# endif +#endif + +/* common cache of ready to go pages */ +static kmem_cache_t * bitmap_page_cache; +/* list of all created bitmaps */ +struct list_head bitmap_list; + +/* + * replaces kfree on bitmap pages. + */ +static void +bitmap_free_page(struct bitmap * bitmap, unsigned char * page) { + + if (!page) + return; + + kmem_cache_free(bitmap_page_cache, (void *)page); +} + +/* + * frees the memory held by the bitmap (map pages, zoneinfo blocks, bp array) + */ +void +bitmap_destr(struct bitmap *bitmap) { + + unsigned long k; + struct bitmap_page * bp; + unsigned long pages; + static void bitmap_stop(struct bitmap * bitmap); + + bitmap_stop(bitmap); + + write_lock(&bitmap->lock); + bitmap->flags &= ~BITMAP_ACTIVE; + bp = bitmap->bp; + pages = bitmap->pages; + bitmap->bp = NULL; + list_del(&bitmap->list); + write_unlock(&bitmap->lock); + + if (bp) { + for (k = 0; k < pages; k++) { + unsigned char *mappage; + void *zoneinfo; + + mappage = bp[k].map; + zoneinfo = bp[k].zoneinfo; + bp[k].map = NULL; + bp[k].zoneinfo = NULL; + + if (IS_ADDRESS(mappage)) { + /* zero page returned to cache via abort */ + memset (mappage, 0, PAGE_SIZE); + bitmap_free_page (bitmap, mappage); + bitmap->missing_pages++; + } + if (IS_ADDRESS(zoneinfo)) { + kfree (zoneinfo); + bitmap->missing_zones++; + } + } + if (bitmap->missing_pages < pages) { + printk(KERN_WARNING "bitmap: mislaid %lu pages. Oops!\n", + pages - bitmap->missing_pages); + } + if (bitmap->missing_pages > pages) { + printk(KERN_WARNING "bitmap: freed %lu extra pages. Oops!\n", + bitmap->missing_pages - pages); + } + if (bitmap->missing_zones < pages) { + printk(KERN_WARNING "bitmap: mislaid %lu zoneinfo block. Oops!\n", + pages - bitmap->missing_zones); + } + if (bitmap->missing_zones > pages) { + printk(KERN_WARNING "bitmap: freed %lu extra zoneinfo block. 
Oops!\n", + bitmap->missing_zones - pages); + } + vfree (bp); + } + + bitmap_stop(bitmap); + +} + +/* + * tests if the bitmap is marked active (has been started) + */ +static int +bitmap_active(struct bitmap * bitmap) { + int res = 0; + if (!bitmap) + return res; + read_lock(&bitmap->lock); + res = bitmap->bp != NULL && (bitmap->flags & BITMAP_ACTIVE) != 0; + read_unlock(&bitmap->lock); + return res; +} + +/* + * replaces kmalloc for bitmap pages. + */ +static unsigned char * +bitmap_alloc_page(struct bitmap *bitmap) { + unsigned char *page; + + page = kmem_cache_alloc(bitmap_page_cache, GFP_KERNEL); + /* PTB zeroing is done by the constructor and only + * clean pages are returned to the cache + */ + return page; +} + +/* + * mark bitmap inactive and maybe prune the page cache + */ +static void +bitmap_stop(struct bitmap * bitmap) { + + write_lock(&bitmap->lock); + bitmap->flags &= ~BITMAP_ACTIVE; + write_unlock(&bitmap->lock); + +} + +/* + * marks the bitmap active and primes the free page cache. + */ +static int +bitmap_start(struct bitmap * bitmap, u64 events) { + + struct bitmap_page * bp; + unsigned long pages; + + /* take lock to read data */ + write_lock(&bitmap->lock); + pages = bitmap->pages; + bp = bitmap->bp; + write_unlock(&bitmap->lock); + + if (!bp) { + + int k; + + bp = vmalloc (pages * sizeof(*bp)); + if (!bp) { + printk(KERN_WARNING "bitmap: cannot get %luB of memory!\n", + pages * sizeof(*bp)); + return -ENOMEM; + } + memset (bp, 0, pages * sizeof(*bp)); + + for (k = 0; k < pages; k++) { + if (bp[k].zoneinfo) + continue; + bp[k].zoneinfo = + kmalloc (sizeof(*bp[k].zoneinfo)<missing_zones--; + } + if (bitmap->missing_zones > 0) { + printk(KERN_WARNING "bitmap: warning! cannot get %ld*%uB memory!\n", + bitmap->missing_zones, + sizeof(*bp->zoneinfo) << ZONESHIFT); + } + } + + /* + * this is 16 shorts or 32 bytes + 4 bytes extra per page of 4096 + * bytes, which is a reserve of less than 0.1%. 
Each page + * bitmaps 32MB of disk, so a 1GB disk takes 32 pages or 128KB, and + * a 1TB disk takes 128MB of pages. In those circumstances, + * adding a capital cost of about 1.08MB/TB doesn't seem bad. + */ + + + write_lock(&bitmap->lock); + bitmap->bp = bp; + + /* bp is nonzero here */ + bitmap->flags |= BITMAP_ACTIVE; + bitmap->events = events; + write_unlock(&bitmap->lock); + + /* seed the page cache */ + bitmap_free_page(bitmap, bitmap_alloc_page(bitmap)); + + return 0; +} + +static int +bitmap_checkpage (struct bitmap *bitmap, unsigned long page) +{ + unsigned char * mappage; + + read_lock(&bitmap->lock); + if (page < 0 || page >= bitmap->pages) { + read_unlock(&bitmap->lock); + return -EINVAL; + } + + if (bitmap->bp == NULL) { + read_unlock(&bitmap->lock); + return -ENODEV; + } + + + if (IS_ADDRESS(bitmap->bp[page].map)) { + read_unlock(&bitmap->lock); + return 0; + } + read_unlock(&bitmap->lock); + + /* the page address was NULL */ + + if ((mappage = bitmap_alloc_page(bitmap)) == NULL) { + /* failed - check to see if we have backup counters */ + void * tmp; + int need_new_zoneinfo = 0; + + write_lock(&bitmap->lock); + if (!IS_ADDRESS(bitmap->bp[page].zoneinfo)) + need_new_zoneinfo = 1; + write_unlock(&bitmap->lock); + + if (need_new_zoneinfo) { + /* rarely, we might make an extra backup counter */ + tmp = kmalloc(sizeof(*bitmap->bp->zoneinfo)<lock); + if (IS_ADDRESS(bitmap->bp[page].zoneinfo)) { + /* somebody else made it first, backout */ + need_new_zoneinfo = 0; + } else { + bitmap->bp[page].zoneinfo = tmp; + bitmap->missing_zones--; + } + write_unlock(&bitmap->lock); + + if (!need_new_zoneinfo) + kfree(tmp); + } + } + return -ENOMEM; + } + + /* got a page */ + + write_lock(&bitmap->lock); + + /* recheck the page */ + + if (IS_ADDRESS(bitmap->bp[page].map)) { + /* somebody beat us to getting the page */ + write_unlock(&bitmap->lock); + bitmap_free_page(bitmap, mappage); + return 0; + } + + /* no page in place and we have one, so maybe install it */ + + if 
(bitmap->bp[page].count != 0) { + /* inpage bitmap - can't replace until no pending writes */ + write_unlock(&bitmap->lock); + bitmap_free_page(bitmap, mappage); + return -EINVAL; + } + + /* good case - we get to make a new page */ + memset(mappage, 0, PAGE_SIZE); + bitmap->bp[page].map = mappage; + if (IS_ADDRESS(bitmap->bp[page].zoneinfo)) { + memset(bitmap->bp[page].zoneinfo, 0, + sizeof(*bitmap->bp->zoneinfo) << ZONESHIFT); + } + bitmap->missing_pages--; + write_unlock(&bitmap->lock); + return 0; + +} + +/* + * offset8 is the BYTE offset, not the bit offset + * We call this routine under lock. + */ +static int +bitmap_clear_mask8 (struct bitmap *bitmap, bitmap_offset_t offset8, + unsigned char mask, unsigned char **this_page) +{ + + unsigned long page ; + unsigned long pageoff; + + unsigned char oldmask; + unsigned char newmask; + unsigned char diffmask; + + page = offset8 >> PAGE_SHIFT; + + if (!IS_ADDRESS(bitmap->bp[page].map)) { + int bits = hweight8(mask); + bitmap->bp[page].count -= bits; + if (IS_ADDRESS(bitmap->bp[page].zoneinfo)) { + int zoneoffset = (offset8 >> (PAGE_SHIFT - ZONESHIFT)); + int zone = zoneoffset & ((1<bp[page].zoneinfo[zone] -= bits; + } + return -EINVAL; + } + + pageoff = offset8 & ~PAGE_MASK; + + oldmask = bitmap->bp[page].map[pageoff]; + newmask = oldmask & ~mask; + diffmask = newmask ^ oldmask; + + if (diffmask) { + unsigned bits = hweight8 (diffmask); + int newcount = (bitmap->bp[page].count -= bits); + + bitmap->bp[page].map[pageoff] = newmask; + + /* most frequent case is a +ve result and return */ + if (newcount > 0) + return 0; + /* negative count is a major misaccounting */ + if (newcount < 0) { + printk(KERN_WARNING "bitmap: dirty count %d on page %lu\n", + newcount, page); + return 0; + } + /* newcount == 0 is when we want to detach the page */ + *this_page = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + return 0; + } + return 0; +} + +static int +bitmap_clear_mask (struct bitmap *bitmap, 
bitmap_offset_t offset, unsigned char mask) +{ + + unsigned long blkgrp; + unsigned char blkoff; + unsigned long page ; + unsigned long pageoff; + int errs = 0; + unsigned char * free_page0 = NULL, *free_page1 = NULL; + + blkgrp = offset >> 3; + page = blkgrp >> PAGE_SHIFT; + + blkoff = offset & 7; + pageoff= blkgrp & ~PAGE_MASK; + + write_lock(&bitmap->lock); + + if (blkoff) { + unsigned char maskdiv = 0xff & (mask << blkoff); + unsigned char maskrem = 0xff & (mask >> (8 - blkoff)); + + if (maskdiv) { + if (bitmap_clear_mask8 (bitmap, blkgrp, maskdiv, &free_page0) < 0) + errs++; + } + if (!maskrem) { + goto out; + } + if (pageoff + 1 < PAGE_SIZE) { + + if (bitmap_clear_mask8 (bitmap, blkgrp + 1, maskrem, &free_page1) < 0) + errs++; + goto out; + } + + if (bitmap_clear_mask8 (bitmap, blkgrp + 1, maskrem, &free_page1) < 0) + errs++; + goto out; + } + + // normal situation. Offset is multiple of 8 + + if (bitmap_clear_mask8 (bitmap, blkgrp, mask, &free_page0) < 0) + errs++; +out: + write_unlock(&bitmap->lock); + if (free_page0) + bitmap_free_page(bitmap, free_page0); + if (free_page1) + bitmap_free_page(bitmap, free_page1); + return errs > 0 ? -EINVAL : 0; +} + + +/* + * offset8 is the BYTE offset, not the bit offset. + * We call this routine under lock. 
+ */ +static int +bitmap_set_mask8 (struct bitmap *bitmap, bitmap_offset_t offset8, + unsigned char mask) +{ + + unsigned long page ; + unsigned long pageoff; + + unsigned char oldmask; + unsigned char newmask; + unsigned char diffmask; + + page = offset8 >> PAGE_SHIFT; + pageoff = offset8 & ~PAGE_MASK; + + if (!IS_ADDRESS(bitmap->bp[page].map)) { + int bits = hweight8(mask); + bitmap->bp[page].count += bits; + /* mark the zone instead - we have 16 low bits markable */ + if (IS_ADDRESS(bitmap->bp[page].zoneinfo)) { + int zoneoffset = (offset8 >> (PAGE_SHIFT - ZONESHIFT)); + int zone = zoneoffset & ((1 << ZONESHIFT) - 1); + bitmap->bp[page].zoneinfo[zone] += bits; + } + return -EINVAL; + } + + oldmask = bitmap->bp[page].map[pageoff]; + newmask = oldmask | mask; + diffmask = newmask ^ oldmask; + + if (diffmask) { + unsigned int bits = hweight8 (diffmask); + bitmap->bp[page].map[pageoff] = newmask; + bitmap->bp[page].count += bits; + } + return 0; +} + +/* + * here offset is the BIT offset + */ +static int +bitmap_set_mask (struct bitmap *bitmap, bitmap_offset_t offset, unsigned char mask) +{ + + unsigned long blkgrp; + unsigned char blkoff; + unsigned long page ; + unsigned long pageoff; + int errs = 0; + + blkgrp = offset >> 3; + page = blkgrp >> PAGE_SHIFT; + + if (bitmap_checkpage(bitmap, page) < 0) + errs++; + + /* the page may or may not have been made */ + + write_lock(&bitmap->lock); + blkoff = offset & 7; + pageoff= blkgrp & ~PAGE_MASK; + + if (blkoff) { + unsigned char maskdiv = 0xff & (mask << blkoff); + unsigned char maskrem = 0xff & (mask >> (8 - blkoff)); + + if (maskdiv) { + if (bitmap_set_mask8(bitmap, blkgrp, maskdiv) < 0) + errs++; + } + + if (!maskrem) { + write_unlock(&bitmap->lock); + return errs > 0 ? -EINVAL : 0 ; + } + + if (pageoff + 1 < PAGE_SIZE) { + + if (bitmap_set_mask8(bitmap, blkgrp + 1, maskrem) < 0) + errs++; + write_unlock(&bitmap->lock); + return errs > 0 ? 
-EINVAL : 0 ; + } + write_unlock(&bitmap->lock); + if (bitmap_checkpage(bitmap, page+1) < 0) + errs++; + + write_lock(&bitmap->lock); + + if (bitmap_set_mask8(bitmap, blkgrp + 1, maskrem) < 0) + errs++; + + write_unlock(&bitmap->lock); + return errs > 0 ? -EINVAL : 0 ; + } + + // normal situation. Offset is multiple of 8 + + if (bitmap_set_mask8(bitmap, blkgrp, mask) < 0) + errs++; + + write_unlock(&bitmap->lock); + return errs > 0 ? -EINVAL : 0; +} + +/* + * offset8 is the BYTE offset, not the bit offset. + * We call this routine under lock. + */ +static int +bitmap_test_mask8 (struct bitmap *bitmap, bitmap_offset_t offset8, + unsigned char mask) +{ + unsigned long page ; + unsigned long pageoff; + + if (!mask) + return 0; + + page = offset8 >> PAGE_SHIFT; + pageoff = offset8 & ~PAGE_MASK; + + if (!IS_ADDRESS(bitmap->bp[page].map)) { + /* look at zone instead - we have 16 low bits markable */ + if (IS_ADDRESS(bitmap->bp[page].zoneinfo)) { + int zoneoffset = (offset8 >> (PAGE_SHIFT - ZONESHIFT)); + int zone = zoneoffset & ((1 << ZONESHIFT) - 1); + if (bitmap->bp[page].zoneinfo[zone] > 0) + return 1; + return 0; + } + /* try the count */ + if (bitmap->bp[page].count > 0) + return 1; + return 0; + } + + return (bitmap->bp[page].map[pageoff] & mask) != 0; +} + +/* + * here offset is the BIT offset + */ +static int +bitmap_test_mask (struct bitmap *bitmap, bitmap_offset_t offset, unsigned char mask) +{ + + unsigned long blkgrp; + unsigned char blkoff; + unsigned long page ; + unsigned long pageoff; + int err; + + blkgrp = offset >> 3; + page = blkgrp >> PAGE_SHIFT; + + if (bitmap_checkpage(bitmap, page) < 0) + ; // ignore. 
We'll react below + + /* the page may or may not have been made */ + + write_lock(&bitmap->lock); + blkoff = offset & 7; + pageoff= blkgrp & ~PAGE_MASK; + + if (blkoff) { + unsigned char maskdiv = 0xff & (mask << blkoff); + unsigned char maskrem = 0xff & (mask >> (8 - blkoff)); + + if (maskdiv) { + if (err = bitmap_test_mask8(bitmap, blkgrp, maskdiv), err) { + write_unlock(&bitmap->lock); + return err; + } + } + + if (!maskrem) { + write_unlock(&bitmap->lock); + return 0 ; + } + + if (pageoff + 1 < PAGE_SIZE) { + + if (err = bitmap_test_mask8(bitmap, blkgrp + 1, maskrem), err) { + write_unlock(&bitmap->lock); + return err; + } + write_unlock(&bitmap->lock); + return 0 ; + } + write_unlock(&bitmap->lock); + if (err = bitmap_checkpage(bitmap, page+1), err < 0) + ; // ignore errors + + write_lock(&bitmap->lock); + + if (err = bitmap_test_mask8(bitmap, blkgrp + 1, maskrem), err) { + write_unlock(&bitmap->lock); + return err; + } + + write_unlock(&bitmap->lock); + return 0 ; + } + + // normal situation. Offset is multiple of 8 + + if (err = bitmap_test_mask8(bitmap, blkgrp, mask), err) { + write_unlock(&bitmap->lock); + return err ; + } + + write_unlock(&bitmap->lock); + return 0; +} + +static int +bitmap_clear_bits (struct bitmap *bitmap, bitmap_offset_t offset, unsigned long bits) +{ + + unsigned char mask; + int errs = 0; + + while (offset + bits > (offset | 7)) { + int more = (offset | 7) + 1 - offset; + mask = (1 << more) - 1; + /* ignore errors and do what we can */ + if (bitmap_clear_mask(bitmap, offset, mask) < 0) { + errs++; + } + bits -= more; + offset |= 7; + offset++; + } + + if (bits > 0) { + mask = (1 << bits) - 1; + /* ignore errors and do what we can */ + if (bitmap_clear_mask(bitmap, offset, mask) < 0) { + errs++; + } + bits = 0; + offset += bits; + } + + return (errs > 0) ? 
-EINVAL : 0; +} + +static int +bitmap_set_bits (struct bitmap *bitmap, bitmap_offset_t offset, unsigned long bits) +{ + unsigned char mask; + int errs = 0; + + while (offset + bits >= (offset | 7) + 1) { + int more = (offset | 7) + 1 - offset; + mask = (1 << more) - 1; + /* ignore errors and do what we can */ + if (bitmap_set_mask(bitmap, offset, mask) < 0) { + errs++; + } + bits -= more; + offset |= 7; + offset++; + } + + if (bits > 0) { + mask = (1 << bits) - 1; + /* ignore errors and do what we can */ + if (bitmap_set_mask(bitmap, offset, mask) < 0) { + errs++; + } + bits = 0; + offset += bits; + } + + return (errs > 0) ? -EINVAL : 0; +} + +static int +bitmap_test_bits (struct bitmap *bitmap, bitmap_offset_t offset, unsigned long bits) +{ + unsigned char mask; + int err; + + while (offset + bits >= (offset | 7) + 1) { + int more = (offset | 7) + 1 - offset; + mask = (1 << more) - 1; + /* ignore errors and do what we can */ + if (err = bitmap_test_mask(bitmap, offset, mask), err) { + return err; + } + bits -= more; + offset |= 7; + offset++; + } + + if (bits > 0) { + mask = (1 << bits) - 1; + /* ignore errors and do what we can */ + if (err = bitmap_test_mask(bitmap, offset, mask), err) { + return err; + } + bits = 0; + offset += bits; + } + return 0; +} + +/* +static int +bitmap_test_bit (struct bitmap *bitmap, unsigned long block) +{ + unsigned long blkgrp; + unsigned char blkoff; + unsigned long page ; + unsigned long pageoff; + int res; + + blkgrp = block >> 3; + page = blkgrp >> PAGE_SHIFT; + + read_lock(&bitmap->lock); + // high bits zero means no page address + if (!IS_ADDRESS(bitmap->bp[page].map)) { + int zoneoffset = (blkgrp >> (PAGE_SHIFT - ZONESHIFT)); + int zone = zoneoffset & ((1 << ZONESHIFT) - 1); + // use the counter instead - this is zoned + res = (bitmap->bp[page].count > 0); + if (res && IS_ADDRESS(bitmap->bp[page].zoneinfo)) + res = (bitmap->bp[page].zoneinfo[zone] > 0); + read_unlock(&bitmap->lock); + return res; + } + + blkoff = block & 7; 
+ pageoff= blkgrp & ~PAGE_MASK; + + res = test_bit ((pageoff << 3) + blkoff, bitmap->bp[page].map) != 0; + read_unlock(&bitmap->lock); + return res; +} +*/ + +/* Zero *bitmap, install its method pointers and link it onto bitmap_list. Returns -EINVAL if the page count derived from bits is negative, else 0. */ +int +bitmap_init(struct bitmap * bitmap, bitmap_offset_t bits) { + + int pages = (bits + (PAGE_SIZE * 8 - 1)) / (PAGE_SIZE * 8); /* round up: PAGE_SIZE * 8 bits per page */ + + memset(bitmap, 0, sizeof(*bitmap)); + rwlock_init (&bitmap->lock); + write_lock(&bitmap->lock); + + bitmap->start = bitmap_start; + bitmap->stop = bitmap_stop; + bitmap->testbits = bitmap_test_bits; + bitmap->setbits = bitmap_set_bits; + bitmap->clearbits = bitmap_clear_bits; + bitmap->active = bitmap_active; + + /* now do 1st level init stuff */ + if (pages < 0) { + write_unlock(&bitmap->lock); + printk(KERN_WARNING "bitmap: initialised for -ve number of pages (%d)!\n", + pages); + return -EINVAL; + } + bitmap->pages = pages; + bitmap->missing_zones = pages; + bitmap->missing_pages = pages; + list_add(&bitmap->list, &bitmap_list); + write_unlock(&bitmap->lock); + return 0; +} +/* Callback handed to kmem_cache_create() for the bitmap_page cache: zeroes one PAGE_SIZE object when SLAB_CTOR_CONSTRUCTOR is set. */ +static void +bitmap_clear_page(void *data, kmem_cache_t *cache, unsigned long flags) { + + if (! (flags & SLAB_CTOR_CONSTRUCTOR)) + return; + if (!data) + return; + memset(data, 0, PAGE_SIZE); +} +/* Reset the global bitmap_list to empty; always returns 0. */ +int +bitmap_init_list(void) { + INIT_LIST_HEAD(&bitmap_list); + return 0; +} +/* Create the bitmap_page slab cache of PAGE_SIZE objects; -ENOMEM if that fails, else 0. */ +int +bitmap_init_page_cache(void) { + + bitmap_page_cache = + kmem_cache_create("bitmap_page", PAGE_SIZE, 0, 0, bitmap_clear_page, NULL); + if (!bitmap_page_cache) + return -ENOMEM; + return 0; +} +/* Module unload: destroy every bitmap still on bitmap_list, then the page cache if it exists. */ +#ifdef MODULE +void cleanup_module(void) +{ + struct bitmap *bitmap, *tbm; + list_for_each_entry_safe(bitmap, tbm, &bitmap_list, list) { + printk(KERN_WARNING "bitmap: destroyed leftover bitmap %p in cleanup.\n", bitmap); + bitmap_destr(bitmap); + } + INIT_LIST_HEAD(&bitmap_list); + if (!bitmap_page_cache) + return; + kmem_cache_destroy(bitmap_page_cache); + bitmap_page_cache = NULL; +} +/* Module load: 0 on success, otherwise the failing step's own negative errno. */ +int init_module(void) +{ + return bitmap_init_list() ?: bitmap_init_page_cache(); /* GNU ?: keeps the errno that || flattened to 1 */ +} + + MODULE_AUTHOR ("Peter T. 
Breuer"); + MODULE_DESCRIPTION ("Bitmap support"); + MODULE_LICENSE("GPL"); + int linux_version_code = LINUX_VERSION_CODE; +#else /* MODULE */ +__initcall(bitmap_init_list); +__initcall(bitmap_init_page_cache); +#endif /* MODULE */ + +/* Compile line: + * + * gcc -O2 -D__KERNEL__ -DMODULE -c bitmap.c -o bitmap.o + * + */ + --- linux-2.4.30/drivers/md/bitmap.h.pre-fr1 Wed Apr 6 18:18:04 2005 +++ linux-2.4.30/drivers/md/bitmap.h Wed Apr 6 18:18:04 2005 @@ -0,0 +1,57 @@ +#ifndef BITMAP_H +#define BITMAP_H 1 + +typedef __s64 bitmap_offset_t; + +struct bitmap_page { + /* + * If a page is missing then we use a per + * page pending write count instead. pages is the number of + * 4k pages in the map. + */ + char * map; + /* + * more precise count per zone (1/16 page), for emergencies. + */ + short *zoneinfo; + /* + * count of dirty bits on the page + */ + unsigned short count; +}; + +struct bitmap { + struct bitmap_page * bp; /* left NULL by bitmap_init() */ + unsigned long pages; /* page count bitmap_init() derives from its bits argument */ + + int (*start) (struct bitmap * bitmap, __u64 events); + void (*stop) (struct bitmap * bitmap); + int (*testbits) (struct bitmap * bitmap, bitmap_offset_t shift, unsigned long nbits); + int (*setbits) (struct bitmap * bitmap, bitmap_offset_t shift, unsigned long nbits); + int (*clearbits) (struct bitmap * bitmap, bitmap_offset_t shift, unsigned long nbits); + int (*active) (struct bitmap * bitmap); + + /* reader/writer lock on this bitmap (an rwlock_t, not a plain spinlock) */ + rwlock_t lock; + +#define BITMAP_ACTIVE 0x01 + unsigned long flags; + + /* + * events count at startup of the bitmap + */ + __u64 events; + + /* + * number of missing zoneinfo sections + */ + unsigned long missing_zones; + unsigned long missing_pages; /* starts equal to pages, like missing_zones */ + struct list_head list; /* link on the global bitmap_list */ +}; + + +int bitmap_init(struct bitmap * bitmap, bitmap_offset_t bits); +void bitmap_destr(struct bitmap * bitmap); + +#endif --- linux-2.4.30/drivers/md/Makefile.pre-fr1 Sun Nov 11 19:09:32 2001 +++ linux-2.4.30/drivers/md/Makefile Wed Apr 6 18:18:04 2005 @@ -7,6 +7,7 @@ export-objs := md.o xor.o list-multi := lvm-mod.o lvm-mod-objs
:= lvm.o lvm-snap.o lvm-fs.o +fr1-objs := raid1.o # Note: link order is important. All raid personalities # and xor.o must come before md.o, as they each initialise @@ -17,6 +18,8 @@ obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID5) += raid5.o xor.o +obj-$(CONFIG_MD_BITMAP) += bitmap.o +obj-$(CONFIG_MD_FR1) += fr1.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_BLK_DEV_MD) += md.o obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o @@ -25,3 +28,9 @@ lvm-mod.o: $(lvm-mod-objs) $(LD) -r -o $@ $(lvm-mod-objs) + +fr1.o: $(fr1-objs) + $(LD) -r -o $@ $(fr1-objs) + +clean: + rm -f *.o .*.sw? --- linux-2.4.30/drivers/md/Config.in.pre-fr1 Fri Sep 14 23:22:18 2001 +++ linux-2.4.30/drivers/md/Config.in Wed Apr 6 18:18:04 2005 @@ -10,7 +10,15 @@ dep_tristate ' Linear (append) mode' CONFIG_MD_LINEAR $CONFIG_BLK_DEV_MD dep_tristate ' RAID-0 (striping) mode' CONFIG_MD_RAID0 $CONFIG_BLK_DEV_MD dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD +if [ "$CONFIG_MD_RAID1" != "n" ]; then + dep_bool ' RAID-1 robust read protocol' CONFIG_MD_RAID1_ROBUST_READ $CONFIG_BLK_DEV_MD +fi dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD +dep_tristate ' Bitmap support for fast raid' CONFIG_MD_BITMAP $CONFIG_BLK_DEV_MD +# fr1.o is linked from raid1.o, so FR-1 is only offered when RAID-1 is not built in; it depends on the bitmap driver +if [ "$CONFIG_MD_RAID1" != "y" ]; then + dep_tristate ' FR-1 (fast intelligent mirroring) mode' CONFIG_MD_FR1 $CONFIG_MD_BITMAP +fi dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD --- linux-2.4.30/Documentation/Configure.help.pre-fr1 Mon Apr 4 03:42:19 2005 +++ linux-2.4.30/Documentation/Configure.help Wed Apr 6 18:18:04 2005 @@ -2050,6 +2050,53 @@ If unsure, say Y.
+FAST RAID-1 (mirroring) mode +CONFIG_MD_FR1 + This driver offers faster software RAID-1 performance when + resynchronizing disks and reading, offers asynchronous writes, and has + various optimizations designed to automate administration. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. There you will also + learn where to get the supporting user space utilities raidtools. + + If you want to use an FR-1 array, say Y. This code is also + available as a module called fr1.o ( = code which can be inserted + in and removed from the running kernel whenever you want). If you + want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. You cannot compile both this and + RAID1 into the kernel, so you may prefer to say M. + + If unsure, say Y. + +FAST RAID-1 bitmap support +CONFIG_MD_BITMAP + This driver provides the needed bitmap support for the Fast RAID-1 + module FR1. + + Information about Software RAID on Linux is contained in the + Software-RAID mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. There you will also + learn where to get the supporting user space utilities raidtools. + + If you want to use Fast RAID-1, say Y. This code is also + available as a module called bitmap.o ( = code which can be inserted + in and removed from the running kernel whenever you want). If you + want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. You cannot compile both this and + FR1 into the kernel, so you may prefer to say M. + + If unsure, say Y. + +Robust RAID-1 read protocol +CONFIG_MD_RAID1_ROBUST_READ + This modification changes the RAID-1 read protocol to withstand read + errors without ejecting the disk from the array. The read is retried + from another mirror instead. + + If unsure, say N. + RAID-4/RAID-5 mode CONFIG_MD_RAID5 A RAID-5 set of N drives with a capacity of C MB per drive provides