--- linux-2.6/drivers/Makefile.orig 2005-07-14 22:43:50.839607298 +0200 +++ linux-2.6/drivers/Makefile 2005-07-18 01:56:55.885417920 +0200 @@ -42,6 +42,7 @@ obj-$(CONFIG_DIO) += dio/ obj-$(CONFIG_SBUS) += sbus/ obj-$(CONFIG_ZORRO) += zorro/ obj-$(CONFIG_MAC) += macintosh/ +obj-$(CONFIG_ENBD) += block/enbd/ obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ obj-$(CONFIG_PARIDE) += block/paride/ obj-$(CONFIG_TC) += tc/ --- linux-2.6.7/drivers/block/enbd/Kconfig.pre-enbd +++ linux-2.6.7/drivers/block/enbd/Kconfig Thu Jan 15 22:09:29 2004 @@ -0,0 +1,47 @@ +# +# ENBD configuration +# +#comment "Enhanced network block device" + +config BLK_DEV_ENBD + tristate 'Enhanced network block device driver' + depends on ENBD + ---help--- + Saying Y here will build in support for the "enhanced network + block device". Like the plain NBD, this device proxies a remote + hard disk or other block device, such as a cdrom or floppy. The + difference between ENBD and NBD is that ENBD is a much more + heavyweight solution for an industrial setting - it does automatic + reconnects after network brownouts, and uses multiple channels at + once to carry the data. It supports remote ioctls, removable + devices, and uses an MD5 sampling technique to accelerate softRAID + resyncs. It will connect through (secure) SSL channels. It will + hide transient errors from overlying softRAID devices, etc. + + You will need the userspace daemons, which packages are mirrored + on several places on the net. The primary source is + ftp://oboe.it.uc3m.es/pub/Programs/nbd/nbd-2.4-current.tgz. + + If you want to compile this driver as a module ( = code which can + be inserted in and removed from the running kernel whenever you + want), say M here and read <file:Documentation/modules.txt>. The + module will be called enbd.o. + + If unsure, say N. 
+ +config BLK_DEV_ENBD_IOCTL + tristate 'Enhanced network block device remote ioctl support' + depends on BLK_DEV_ENBD + ---help--- + Saying Y here will build in support to ENBD for execution + of remote ioctls on compatible architectures. This means that + you can eject a CD on the other machine. + + If you want to compile this driver as a module ( = code which can + be inserted in and removed from the running kernel whenever you + want), say M here and read <file:Documentation/modules.txt>. The + module will be called enbd_ioctl.o. + + If unsure, say N. +#endmenu + --- linux-2.6.7/drivers/block/enbd/Makefile.pre-enbd +++ linux-2.6.7/drivers/block/enbd/Makefile Thu Jan 15 22:09:29 2004 @@ -0,0 +1,11 @@ +# +# Makefile for ENBD (Peter T. Breuer ptb@it.uc3m.es) +# +# + +obj-$(CONFIG_BLK_DEV_ENBD) += enbd.o +obj-$(CONFIG_BLK_DEV_ENBD_IOCTL) += enbd_ioctl.o + +enbd-objs := enbd_base.o enbd_ioctl_stub.o enbd_seqno.o enbd_md.o enbd_speed.o enbd_proc.o + + --- linux-2.6.7/drivers/block/enbd/enbd_base.c.pre-enbd +++ linux-2.6.7/drivers/block/enbd/enbd_base.c Sun Aug 8 12:17:35 2004 @@ -0,0 +1,5302 @@ +/* + * (Enhanced) Network block device - make block devices work over TCP + * + * Original NBD Copyright 1997 Pavel Machek + * Further ENBD Copyrights 1998, 1999, 2000 Peter Breuer + * + * + * + * ATTENTION: You need the userspace daemons available from + * ftp://oboe.it.uc3m.es/pub/Programs/nbd-2.4.*.tgz + * and/or the ENBD project on http://freshmeat.net + * + * + * + * Development of the ENBD software has been supported by grants and + * contributions from Realm Information Technologies, Inc. of 5555 + * Oakbrook Parkway, NW Norcross, GA and iNsu Innovations Inc. of + * 3465, Boulevard Thimens, Saint-Laurent, Quebec, Canada. + * + * ------------ Pavel's history notes ---------------------------------- + * 97-3-25 compiled 0-th version, not yet tested it + * (it did not work, BTW) (later that day) HEY! it works! + * (bit later) hmm, not that much... 
2:00am next day: + * yes, it works, but it gives something like 50kB/sec + * 97-3-28 it's completely strange - when using 1024 byte "packets" + * it gives 50kB/sec and CPU idle; with 2048 bytes it gives + * 500kB/sec (and CPU loaded 100% as it should be) (all done + * against localhost) + * 97-4-1 complete rewrite to make it possible for many requests at + * once to be processed + * 97-4-1 23:57 rewrite once again to make it work :-( + * 97-4-3 00:02 hmm, it does not work. + * 97-4-3 23:06 hmm, it will need one more rewrite :-) + * 97-4-10 It looks like it's working and stable. But I still do not + * have any recovery from lost connection... + * (setq tab-width 4) + * 97-4-11 Making protocol independent of endianity etc. + * 97-4-15 Probably one more rewrite, since it loses requests under + * heavy loads + * 97-9-13 Cosmetic changes + * + * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall + * why not: would need verify_area and friends, would share yet another + * structure with userland + * + * FIXME: not module-safe + * + * ------------ Peter's history notes ---------------------------------- + * 98-12-18 modules now OK ptb@it.uc3m.es (Peter Breuer) ported to + * 2.0.*. + better debugging. Still possible lockup in connection with APM + * and spurious interrupt - only on write. Error treatment should + * be improved. After 100 errors from end_request the kernel can + * do anything. We should catch it ourselves. + * 99-1-sometime fixed lockup by extending semaphore - ptb v1.0. 
+ * 99-3-sometime reconnect protocol (client mod agreed by pavel) - ptb v1.1 + * 99-4-25 add /proc/nbdinfo - ptb v1.1.1 + * 99-4-sometime add multiplex - ptb v1.2 + * 99-4-26 fix multiplex and redundancy - ptb v1.2.1 + * 99-4-29 reentrant client threads - ptb v1.2.2 + * 99-4-29 socket related stuff placed in user space - amarin v1.3.0 + * 99-5-3 fix all, all writes had to be before all reads - ptb v1.2.4 + * 99-5-5 fix out-of-order, async - ptb v1.2.5 + * 99-5-7 semaphores removed (still works!), fail cases corrected - ptb v1.2.6 + * 99-5-12 signals unblocked in xmit, blksize != 1024 fixed, ioctls + * added - ptb v1.2.7 + * 99-6-1 interaction with client split into two functions - amarin v1.3.0 + * 99-6-3 reintegrated fully, mem manager fixed, accounts fixed - ptb v1.2.8.3 + * 99-6-3 extra queue removed, mem manager removed - ptb v1.2.8.4 + * 99-7-3 buffer registration introduced - ptb v1.2.8.5 + * 99-7-3 some client redundancy reestablished - ptb v2.1.1 + * 99-7-10 encapsulated queue calls. One element rollback buffer - ptb v2.1.2 + * 99-7-20 timestamp and rollback old abandoned request - ptb v2.1.3 + * 99-7-24 64bit file sizes and offsets accepted - ptb v2.1.9 + * 99-7-26 experimental request coalesces - ptb v2.1.10 + * 99-7-27 partitioning scheme - ptb v2.2.1 + * 99-8-3 enbd_clr_sock bug in invalidate_device fixed? - ptb v2.2.4 + * 99-8-5 reverse replace of block_fsync, add sig ioctls - ptb v2.2.5 + * reverse bug introduced about v2.2.3 for compound reqs - ptb v2.2.5 + * fix clear_que bug (didn't rollback first) from 2.1.3 - ptb v2.2.5 + * 99-8-22 workaround strange nr_sectors bug - ptb v2.2.6 + * 99-8-11 fix MY_NBD_SYNC bug. Never sync'ed all - ptb v2.2.7 + * 99-8-12 wakeups all moved to enqueue - ptb v2.2.7 + * 99-8-23 remove slot->cli_age - ptb v2.2.7 + * 99-8-24 first 8 bytes of signature embedded in packets - ptb v2.2.8 + * fix SET_SIG define buglet, remove hardcoded constants - ptb v2.2.8 + * fix huge bug. 
Missing copy_fromfs in my_nbd_ack - ptb v2.2.8 + * removed signature embedding and all other decorations - ptb v2.2.8 + * 99-8-25 recast fix in my_nbd_ack to avoid align. bug - ptb v2.2.9 + * put in MKDEVs and put back some hardcode const fixes - ptb v2.2.10 + * 99-9-29 fix BLKGETSIZE bug - ptb v2.2.14 + * 99-10-2 run with interrupts on throughout. Think we lose some - ptb v2.2.15 + * 99-10-8 trim dead code, kernel 2.2 ifdef's - ptb v2.2.17 + * 99-12-18 further o-o - ptb v2.2.19 + * 99-12-28 queue account cleanup. endio on queue reqs at reset - ptb v2.2.20 + * interruptible semaphores for better client recovery - ptb v2.2.20 + * 00-1-2 debugging cleanups. Fix race in end_request - ptb v2.2.21 + * 00-1-4 semaphores simplified. - ptb v2.2.22 + * 00-6-8 emergency control by write to proc - ptb v2.2.24 + * 00-7-20 ported to 2.4.0-test1. Possible minor bugs found/fixed - ptb v2.2.24 + * 00-7-27 changed proc i/f to read_proc from get_info in 2.2/2.4 - ptb v2.2.25 + * 00-7-30 fixed reads before writes under 2.4 by disabling merge - ptb v2.2.25 + * 00-7-30 and fixed merge_reqs for 2.4, now that I understand! - ptb v2.2.25 + * 00-7-30 fixed/introduced possible bug in end_io for 2.2/2.4 - ptb v2.2.25 + * 00-7-30 added timeval/zone field in requests and replies - ptb v2.4.0 + * 00-7-30 fixed hitherto masked bug in read_stat in enbd_client - ptb v2.4.0 + * 00-7-30 added timeout to net writes in enbd_client - ptb v2.4.0 + * 00-8-20 display fix for devices over 2GB - ptb v2.4.5 + * 00-8-23 more 64 bit fixes + error out overrange requests- ptb v2.4.6/2.2.27 + * 00-8-31 add ENBD_ERR ioctl to error out slot request- ptb v2.4.9 + * 00-8-31 soften ENBD_SOFT_RESET so doesn't wreck protocol - ptb v2.4.9 + * 00-9-1 remove %L's from printfs. Kernel 2.2. 
doesn't - ptb v2.4.10/2.2.27 + * 00-9-6 add various state flags to help init order - ptb v2.4.11 + * 00-9-8 add checks for device initialised to set_sock - ptb v2.4.12 + * 00-9-17 en/disable device as aslot count goes through 0 - ptb v2.4.13/2.2.28 + * 00-9-21 split read/write dev req counts for accounting - ptb v2.4.14 + * 00-9-21 renamed sync_intvl to req_timeo - ptb v2.4.14 + * 00-9-21 made sync_intvl count write blocks - ptb v2.4.14 + * 00-9-22 repair enable after delayed disable when disabled - ptb v2.4.14 + * 00-9-22 include sync (nonblocking) after sync_intvl reqs - ptb v2.4.14 + * 00-9-25 disable sync (nonblocking) after sync_intvl reqs - ptb v2.4.14 + * 00-9-25 bundle invalidate_buffers in clr_sock - ptb v2.4.14 + * 00-10-20 implement req_timeo per device + ioctl (Wang Gang) - ptb v2.4.15 + * 00-10-20 add raid mode (Wang Gang) - ptb v2.4.15 + * 00-10-26 throttle in do_req - ptb v2.4.15 + * 00-10-28 do set_sock on first open and clr_sock on last close - ptb v2.4.15 + * 00-11-01 make sync_intvl really sync - ptb v2.4.15 + * 00-11-14 rename throttle to plug, enbd_sync takes arg - ptb v2.4.17 + * 00-11-19 clr_sock errs req not rollback if show_errs & !aslot - ptb v2.4.17 + * 00-11-20 removed autodeadlock when disabled in do_req end_req - ptb v2.4.17 + * 00-11-21 make MY_NBD_SYNC only sync when sync_intvl > 0 - ptb v2.4.17 + * 00-12-24 make MY_NBD_GET_REQ use a timeout arg - ptb v2.4.18 + * 01-02-12 ported to 2.4.0 (works). 
do_enbd_request rewritten - ptb v2.4.20 + * 01-02-20 managed to get plugging and clustered read/writes OK - ptb v2.4.21 + * 01-02-21 eliminated slot->buflen for the time being - ptb v2.4.21 + * 01-02-27 added proper devfs support - ptb v2.4.22 + * 01-03-15 allowed more devices/in devfs, cleaned up endio - ptb v2.4.23 + * 01-03-15 added device letter (<= 3 chars) to struct- - ptb v2.4.23 + * 01-03-15 added request size check to do_enbd_req - ptb v2.4.23 + * 01-03-15 increased MAX_SECTORS to 512 by default - ptb v2.4.23 + * 01-03-15 made major number a module parameter - ptb v2.4.23 + * 01-03-18 added max_sectors array - ptb v2.4.23 + * 01-03-23 added devfs links - ptb v2.4.23 + * 01-04-17 plugging always enabled for 2.4 kernels - ptb v2.4.24 + * 01-04-17 made SET_RO set_device_ro as well as set local flags - ptb v2.4.25 + * 01-04-28 impl SET_MD5SUM ioctl and proc support for md5sum - ptb v2.4.25 + * 01-04-29 added accounting for md5'd reqs - ptb v2.4.25 + * 01-07-29 added atomic protections for accounting - ptb v2.4.25 + * 01-08-01 fixed 2.4 smp bugs. Interrupts off in spinlocks - ptb v2.4.25 + * 01-08-01 removed all semaphores for spinlocks - ptb v2.4.25 + * 01-08-01 invalidate_buffers in clr_sock (req'd Rogier Wolff) - ptb v2.4.25 + * 01-08-02 fixed smp deadlock - end_that_request_first slept! 
ptb v2.4.26 + * 01-10-16 provisionally added error in device open when not enabled ptb v2.4.27 + * 01-10-18 added DIRTY flag to save on repeated invalidate_buffers ptb v2.4.27 + * 01-10-31 increment seqno_out before delivery, so really starts at 1 v2.4.27 + * 01-11-01 move zeroing of seqno in cmd field to nbe_end_req* ptb v2.4.27 + * 01-11-18 add speed calculation, dev fields, display in proc ptb v2.4.27 + * 01-11-20 modifications for compiling into monolithic kernel ptb v2.4.27 + * 01-12-06 clr requests before reenabling, not after, in enbd_enable ptb 2.4.27 + * 02-02-21 make enbd_rollback modal, absorbing enbd_error ptb 2.4.27 + * 02-08-08 added local BLKSSZGET (reject) and related ioctls ptb 2.4.30 + * 02-08-12 make enbd_ack not ruin req when it's rolled back already ptb 2.4.30 + * 02-09-18 fix __FUNCTION__ for new gcc ptb 2.4.30 + * 02-09-18 always allow daemon death even with reqs waiting ptb 2.4.30 + * 02-09-18 eliminate SYNC_REQD, RLSE_REQD ptb 2.4.30 + * 02-09-18 eliminate speed_lim ptb 2.4.30 + * 02-09-18 fix countq accounting ptb 2.4.30 + * 02-09-18 encapsulate remote ioctl handling ptb 2.4.30 + * 02-09-18 remote ioctl uses kernel req, not our fake one ptb 2.4.30 + * 02-09-18 eliminated ctldta use (too much tricky logic) ptb 2.4.30 + * 02-09-28 handle req specials ptb 2.4.30 + * 02-10-10 introduce DIRECT flag ptb 2.4.30 + * 02-10-13 rollback pushes reqs to local queue, not queues them! ptb 2.4.30 + * 02-10-13 add hooks for separate ioctl module ptb 2.4.30 + * 02-10-16 take set_sock out of open. 
Put pid check in handshake ptb 2.4.30 + * 02-10-16 define MY_NBD_GET_NPORT ioctl ptb 2.4.30 + * 02-10-18 remove wait from MY_NBD_SYNC ioctl ptb 2.4.30 + * 02-10-20 rollback adds requests to queue in seqno order ptb 2.4.30 + * 02-10-23 introduce and use pid_sem instead of req_sem ptb 2.4.30 + * 02-10-30 support client fallback to ioctls on whole disk ptb 2.4.30 + * 02-11-3 moved set INITIALISED up to coincide with setting inode ptb 2.4.30 + * 02-11-3 add media check and revalidate routines ptb 2.4.30 + * 02-11-4 encapuslate lives++ and ENABLED changes into enbd_enable ptb 2.4.30 + * 02-11-4 set_enable from proc only enables, not clears queue ptb 2.4.30 + * 11-11-4 take blk_put_request out of end_request (it locks!) ptb 2.4.30 + * 11-11-4 replace list_del by list_del_init ptb 2.4.30 + * 02-12-7 enbd_release made aware of daemons on whole disk ptb 2.4.30 + * 03-01-7 added ioctls for setfaulty etc. ptb 2.4.31 + * 03-02-1 used metalock for non-queue changes ptb 2.4.31 + * 03-03-12 add md_list notification ioctls ptb 2.4.31 + * 03-04-20 turn show_errs on under raid ptb 2.4.31 + * 03-05-06 add a magic-in-arg requirement to set sig on whole disk 2.4.31 + * 04-01-08 allow -ve md5 thresholds and let them mean "never". ptb 2.4.32 + * 04-01-9 localize md5sum thresholds to devices ptb 2.4.32 + * 04-01-11 add noauto for md5sum ptb 2.4.32 + * 04-01-12 beef up sysctl interface. 
separate defaults and devices ptb 2.4.32
+ * 04-01-30 nbd_ack treats -ve reply.error as remote, +ve as local ptb 2.4.32
+ * 04-01-30 check media uses remote check ioctl writeonly mode ptb 2.4.32
+ */
+
+#include
+/*
+ * Fallbacks for trees whose headers do not provide the UNIX98 pty major
+ * constants. Note that an object-like macro takes no '=': writing
+ * "#define NAME=VALUE" makes NAME expand to "=VALUE".
+ */
+#ifndef UNIX98_PTY_MAJOR_COUNT
+  #define UNIX98_PTY_MAJOR_COUNT 8
+  #ifndef UNIX98_NR_MAJORS
+    #define UNIX98_NR_MAJORS UNIX98_PTY_MAJOR_COUNT
+  #endif
+#endif
+
+#include
+
+#if defined(__GNUC__) && __GNUC__ >= 2
+#define _LOOSE_KERNEL_NAMES
+#endif
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* PTB default major number; "major" is also settable as a module parameter */
+#define MAJOR_NR NBD_MAJOR
+static int major = MAJOR_NR;
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE)
+#include
+#endif /* defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) */
+
+/*                                                       *
+ * PTB --------------- compatibility ------------------- *
+ *                   layer starts here.                  *
+ */
+
+  /*
+   * PTB BH_Protected disappeared somewhere around 2.4.10 but this is
+   * still needed for the very rare write local/read remote mode. Don't
+   * worry about it in normal operation!
+   *
+   * Expands to a bare brace block that marks the buffer dirty and
+   * up to date and then refiles it.
+   */
+  #define mark_buffer_protected(rbh) \
+  { \
+      mark_buffer_dirty (rbh); \
+      mark_buffer_uptodate (rbh, 1); \
+      refile_buffer (rbh); \
+  }
+
+  /*
+   * PTB list interface extensions
+   *
+   * list_head/list_tail give the first/last entry of the list at ptr
+   * (as a "type" whose list_head member is called "member"), or NULL
+   * if the list is empty. The ptr argument is parenthesised so that
+   * an expression such as &a->b casts as a whole.
+   */
+  #define list_head(ptr, type, member) \
+  (list_empty(ptr)?NULL:list_entry(((struct list_head *)(ptr))->next,type,member))
+  #define list_tail(ptr, type, member) \
+  (list_empty(ptr)?NULL:list_entry(((struct list_head *)(ptr))->prev,type,member))
+
+  /* PTB for arches without the atomic mask ops (and no smp, I think!)
+   * - feel free to correct with assembler
+   *
+   * These fallbacks are plain read-modify-write operations on the
+   * counter. The expansion is fully parenthesised so that it behaves
+   * as a single expression wherever it is used.
+   */
+  #ifndef atomic_set_mask
+    #define atomic_set_mask(mask, x) ((x)->counter |= (mask))
+  #endif
+  #ifndef atomic_clear_mask
+    #define atomic_clear_mask(mask, x) ((x)->counter &= ~(mask))
+  #endif
+
+/*                                                       *
+ * PTB --------------- compatibility ------------------- *
+ * layer ENDS here. 
* + */ +#if defined(MODULE) +int linux_version_code = LINUX_VERSION_CODE; +#endif +#include +#include +#include + +/* + * PTB kernel data - 4KB worth + * We need space for nda, nda1, .. nda15, ndb, ndb1, .. + * The index is exactly the minor number. + * Each of these arrays has MAX_NBD * ENBD_MAXCONN entries. + */ + static int enbd_blksizes[MAX_NBD * ENBD_MAXCONN]; + static int enbd_sizes[MAX_NBD * ENBD_MAXCONN]; + static __u64 enbd_bytesizes[MAX_NBD * ENBD_MAXCONN]; + static int enbd_max_sectors[MAX_NBD * ENBD_MAXCONN]; + +/* + * PTB our data - about 3KB + * These are nda, ndb, ndc, ... + * Divide the minor by ENBD_MAXCONN to get this index. + * enbd_dev[] and enbd_md are private to this file; enbd_remote_ioctl + * and enbd_get() have external linkage. + */ + static struct enbd_device enbd_dev[MAX_NBD]; + static struct enbd_md enbd_md; + struct enbd_ioctl_stub enbd_remote_ioctl; + /* + * Return the address of entry @i of enbd_dev[]. + * @i is used as the array index without any range check, so the + * caller has to pass 0 <= i < MAX_NBD. + */ + struct enbd_device * enbd_get(int i) { + return &enbd_dev[i]; + } + /* + * Error-exit helpers. Both expand to a bare brace block and rely on + * names in the calling function: + * ENBD_FAIL(s) logs s plus the local "result" through + * ENBD_DEBUG at level 1, then jumps to the + * label error_out. + * ENBD_HARDFAIL(s) logs s plus "result" through ENBD_ERROR, + * stores result in lo->harderror, then jumps + * to the label hard_error_out. + * s must be a string literal as it is pasted next to the format. + */ + #define ENBD_FAIL( s ) { \ + ENBD_DEBUG(1, s " (result %d).\n" , result ); \ + goto error_out; \ + } + #define ENBD_HARDFAIL( s ) { \ + ENBD_ERROR( s " (result %d).\n" , result ); \ + lo->harderror = result; \ + goto hard_error_out; \ + } + +/* + * PTB device parameters. These are module parameters too. + */ + + static int rahead = ENBD_RAHEAD_DFLT;/* PTB - read ahead blocks */ + static int sync = ENBD_SYNC_INTVL; /* PTB - sync every n secs/Kreqs */ + static int merge_requests /* PTB - bool, do request coalesce */ + = ENBD_MERGE_REQ_DFLT; + static int buf_sectors = ENBD_MAX_SECTORS; + /* PTB - user bufsize required */ + static int show_errs = 1; /* PTB - RAID mode? 
not usually */
+    static int direct = 0; /* PTB - all opens are O_DIRECT */
+    static int plug = ENBD_PLUG_DFLT;
+
+    static int md5sum = 0; /* PTB - use md5summing write proto */
+    static int md5_noauto = 0; /* PTB - allow md5summing proto */
+
+    static int md5_on_threshold = 1000; /* PTB - reqs reqd to turn md5 on */
+    static int md5_off_threshold = 10; /* PTB - errs reqd to turn md5 off */
+    static int req_timeo = ENBD_REQ_TIMEO; /* PTB - base timeout */
+
+
+
+#ifndef NO_BUFFERED_WRITES
+    static int buffer_writes = 0; /* PTB - act like ramd on write */
+#endif /* NO_BUFFERED_WRITES */
+
+#if defined(MODULE)
+    MODULE_PARM (rahead, "i");
+    MODULE_PARM (sync, "i");
+    MODULE_PARM (merge_requests, "i");
+    MODULE_PARM (buf_sectors, "i");
+    MODULE_PARM (show_errs, "i");
+    MODULE_PARM (direct,"i");
+    #ifndef NO_BUFFERED_WRITES
+    MODULE_PARM (buffer_writes, "i");
+    #endif /* NO_BUFFERED_WRITES */
+    MODULE_PARM (major, "i");
+    MODULE_PARM (md5sum, "i");
+    MODULE_PARM (md5_on_threshold, "i");
+    MODULE_PARM (md5_off_threshold, "i");
+    MODULE_PARM(req_timeo, "i");
+#endif
+
+/* Defined only from here on, i.e. after buffer_writes was declared above */
+#define NO_BUFFERED_WRITES 1
+
+/*
+ * PTB pair of helpful additional functions, only good for 1 bit in the
+ * mask, however. Modify if you want more.
+ *
+ * Only the lowest set bit of @mask (the one ffs finds) is acted on.
+ *
+ * @a the atomic element's address
+ * @mask the integer with one bit set in the position that we want to test
+ *       and set, or clear
+ *
+ * atomic_test_and_set_mask returns what test_and_set_bit returns for
+ * that bit, or -EINVAL when @mask is zero. atomic_test_and_clear_mask
+ * returns what test_and_clear_bit returns, or 0 when @mask is zero.
+ *
+ * The bit operations address an unsigned long, so on big-endian
+ * machines the bit number is raised by the difference in width between
+ * long and the counter. The byte order is taken from __LITTLE_ENDIAN /
+ * __BIG_ENDIAN; any other configuration is a compile-time error.
+ */
+static int
+atomic_test_and_set_mask (atomic_t * a, unsigned mask)
+{
+    int i = ffs (mask);
+    if (!i)
+        return -EINVAL;
+    // PTB gahhhh ...
+    #if defined(__LITTLE_ENDIAN)
+    return test_and_set_bit (i - 1, (unsigned long *)&a->counter);
+    #elif defined(__BIG_ENDIAN)
+    return test_and_set_bit
+        (i - 1 + (sizeof(long)-sizeof(a->counter))*8,
+        (unsigned long *)&a->counter);
+    #else
+    #error help, I only know about bigendian or littlendian machines
+    #endif
+}
+static int
+atomic_test_and_clear_mask (atomic_t * a, unsigned mask)
+{
+    int i = ffs (mask);
+    if (!i)
+        return 0;
+    // PTB gahhhh ...
+    #if defined(__LITTLE_ENDIAN)
+    return test_and_clear_bit (i - 1, (unsigned long *)&a->counter);
+    #elif defined(__BIG_ENDIAN)
+    return test_and_clear_bit
+        (i - 1 + (sizeof(long)-sizeof(a->counter))*8,
+        (unsigned long *)&a->counter);
+    #else
+    #error help, I only know about bigendian or littlendian machines
+    #endif
+}
+
+/*                                                       *
+ * PTB --------------- functions ----------------------- *
+ */
+
+/*
+ * PTB
+ * pick the hidden enbd type out of a req @req
+ *
+ * Returns the private_data of the request's rq_disk, or NULL when the
+ * request has no rq_disk. @req itself must not be NULL.
+ */
+static struct enbd_device*
+rq_get_enbd(struct request *req) {
+    return req->rq_disk ? req->rq_disk->private_data : NULL;
+}
+
+/*
+ * PTB
+ * Decode the request type of a request and return it. Don't we
+ * have anywhere else to put this? Yes, in private data. But
+ * that's just a pointer to our device data so we don't use it.
+ *
+ * we use the low bit (REQ_RW) of the flags and the special bit
+ * to designate the type of request:
+ *
+ *      REQ_RW  REQ_SPECIAL     type
+ *        0         0           READ
+ *        1         0           WRITE
+ *        0         1           MD5SUM
+ *        1         1           IOCTL
+ *
+ * @req the request to get the type of.
+ */
+
+static inline int
+rq_type (struct request *req)
+{
+    switch ( ((req->flags & REQ_RW)     ?1:0)
+           | ((req->flags & REQ_SPECIAL)?2:0)
+           ) {
+        case 0:
+            return READ;
+        case 1:
+            return WRITE;
+        case 2:
+            return MD5SUM;
+        case 3:
+            return IOCTL;
+    }
+    // PTB report strangeness if it is strange
+    return -1;
+}
+
+/*
+ * PTB code the request type into a request.
+ *
+ * This appears to be only used when making an ioctl request and it
+ * never really escapes from our private area and it doesn't matter too
+ * much how efficient it is either.
+ *
+ * This function marks a request for conventional viewing as
+ * being of the designated conceptual type. It corresponds to the old
+ * "type" field in requests.
+ *
+ * @req the request to set the type on
+ * @type one of READ, WRITE, etc. 
+ */
+static void
+set_rq_type (struct request *req, int type)
+{
+    unsigned long type_bits;
+
+    /* translate the conceptual type into the RW/SPECIAL bit pair */
+    switch (type) {
+    case READ:                      /* RW=0 SPECIAL=0 */
+        type_bits = 0;
+        break;
+    case WRITE:                     /* RW=1 SPECIAL=0 */
+        type_bits = REQ_RW;
+        break;
+    case IOCTL:                     /* RW=1 SPECIAL=1 */
+        type_bits = REQ_RW | REQ_SPECIAL;
+        break;
+    case MD5SUM:                    /* RW=0 SPECIAL=1 */
+        type_bits = REQ_SPECIAL;
+        break;
+    default:
+        /* not a type we know how to encode: leave the request be */
+        return;
+    }
+
+    req->flags &= ~(REQ_RW | REQ_SPECIAL);
+    req->flags |= type_bits;
+}
+
+/*
+ * PTB how many device blocks does a request span? The sector count is
+ * rounded up to a whole number of blocks, so the answer overestimates
+ * when the request is not block aligned. The block size is taken from
+ * the logblksize of the enbd device that owns the request.
+ *
+ * A -ve value (-EINVAL) is returned when there is no request or the
+ * request leads to no enbd device.
+ *
+ * @req - the request to measure
+ */
+inline long
+rq_nr_blks (struct request *req)
+{
+    struct enbd_device *dev;
+    unsigned blk_shift;
+    unsigned blk_sectors;
+    int nsect;
+    int nblks;
+
+    dev = req ? rq_get_enbd(req) : NULL;
+    if (!dev)
+        return -EINVAL;
+
+    /* sectors are 512 bytes, hence the 9 */
+    blk_shift = dev->logblksize - 9;
+    blk_sectors = 1 << blk_shift;
+
+    nsect = req->nr_sectors;
+    nblks = (nsect + blk_sectors - 1) >> blk_shift;
+
+    return nblks;
+}
+
+/*
+ * Translate a device index into its letter suffix. Indices below 26
+ * give one letter ("a" for 0); larger indices give two letters, the
+ * first from i / 26 and the second from i % 26. The text lives in a
+ * static, zero terminated buffer that is only rewritten when the
+ * index differs from the one asked for last time, so the result is
+ * overwritten by the next call with a different index.
+ *
+ * @i index to be turned into x[y] alphabetical form.
+ */
+static char *
+device_letter (int i)
+{
+    static char letters[3];
+    static int last_i = -1;
+
+    if (last_i != i) {
+        last_i = i;
+        if (i < 26) {
+            letters[0] = 'a' + i;
+            letters[1] = 0;
+        } else {
+            letters[0] = 'a' + i / 26;
+            letters[1] = 'a' + i % 26;
+            letters[2] = 0;
+        }
+    }
+    return letters;
+}
+
+/*
+ * PTB auxiliary functions for manipulating the sequence number. Isn't
+ * there anything private we can use in a request? 
+ *
+ * The sequence number is kept in the request's "special" pointer
+ * field, converted through long so that the pointer/integer casts are
+ * between types of the same width.
+ *
+ * rq_seqno returns the seqno
+ *
+ * @req the request to get the sequence number of
+ */
+static int
+rq_seqno (struct request *req)
+{
+    return (int)(long)req->special;
+}
+
+/*
+ * Store sequence number @val in the "special" field of @req.
+ */
+static void
+rq_set_seqno (struct request *req, int val)
+{
+    req->special = (void *)(long)val;
+}
+
+/*
+ * PTB sync the device. Modes:
+ * @arg = 0: Do it async - currently a no-op, see the FIXME below
+ * @arg != 0: Do it sync, via fsync_bdev on the device inode's bdev
+ *
+ * Nothing at all is done unless the device has ENBD_INITIALISED set
+ * and has an inode recorded.
+ *
+ * We can't call sync_dev outside a process context. I don't know why.
+ * Death results from a scheduled attempt.
+ */
+static void
+enbd_sync (struct enbd_device *lo, long arg)
+{
+    struct inode *inode = lo->inode;
+
+    if (!(atomic_read (&lo->flags) & ENBD_INITIALISED) || !inode)
+        return;
+
+    // PTB sync_dev is async. fsync_dev is sync.
+    switch (arg) {
+    case 0:        // async
+        // PTB 2.5.7 does not have async sync! FIXME
+        break;
+    default:       // sync
+        fsync_bdev (inode->i_bdev);
+        break;
+    }
+}
+
+/* Request an asynchronous sync of @lo (enbd_sync mode 0). */
+static void
+enbd_async_sync (struct enbd_device *lo)
+{
+    enbd_sync (lo, 0);
+}
+
+/* Request a synchronous sync of @lo (enbd_sync mode 1). */
+static void
+enbd_sync_sync (struct enbd_device *lo)
+{
+    enbd_sync (lo, 1);
+}
+
+/*
+ * Do sync async if we're enabled and the remote is not marked invalid,
+ * sync otherwise.
+ *
+ * @lo the device to maybe sync (sync or async sync!)
+ */
+static void
+enbd_maybe_sync_sync (struct enbd_device *lo)
+{
+    /* one read, so that both tests see the same flags */
+    int flags = atomic_read (&lo->flags);
+
+    if ((flags & ENBD_ENABLED) && !(flags & ENBD_REMOTE_INVALID)) {
+        enbd_async_sync (lo);
+        return;
+    }
+    enbd_sync_sync (lo);
+}
+
+
+
+
+/*
+ * PTB - put a request onto the head of a nbd device's queue
+ * - presumably having taken it off the kernel's queue first!
+ * - We take the queue spinlock.
+ *
+ * @lo = the device we are on (could we get it from the req?) 
+ * @req = the request we shift
+ *
+ * A request for which rq_nr_blks reports an error is logged and left
+ * alone. Otherwise the per-direction statistics are updated, the
+ * request is added at the head of lo->queue under the queue write
+ * lock, and sleepers on lo->wq are woken.
+ */
+static void
+enbd_enqueue (struct enbd_device *lo, struct request *req)
+{
+    /* signed, so that the -ve error return of rq_nr_blks is seen */
+    long req_blks = rq_nr_blks (req);
+    int countq;
+    int cmd;
+
+    if (req_blks < 0) {
+        short islot = atomic_read (&lo->islot);
+        ENBD_ERROR ("(%d): invalid req %p. Not touching!\n",
+            islot, req);
+        return;
+    }
+
+    /* PTB accounting and nothing more */
+    cmd = rq_data_dir (req);
+    atomic_add (req_blks, &lo->requests_in[cmd]);
+
+    // PTB do we need locks here? Apparently not.
+    atomic_inc (&lo->countq[cmd]);
+    countq = atomic_read (&lo->countq[cmd]);
+
+    // PTB the maxes are just noncritical stats
+    if (atomic_read (&lo->maxq[cmd]) < countq)
+        atomic_set (&lo->maxq[cmd], countq);
+    atomic_inc (&lo->req_in[cmd][req_blks]);
+    // PTB the maxes are just noncritical stats
+    if (atomic_read (&lo->maxreqblks) < req_blks)
+        atomic_set (&lo->maxreqblks, req_blks);
+
+    /* PTB now the real work: onto the head of our queue */
+    write_lock (&lo->queue_lock);
+
+    list_add (&req->queuelist, &lo->queue);
+
+    write_unlock (&lo->queue_lock);
+
+    wake_up_interruptible (&lo->wq);
+
+}
+
+/*
+ * PTB - remove a request from anywhere in the nbd device general queue
+ * - return 0 for success, -ve for fail
+ *
+ * We need to hold the queue lock when calling this routine.
+ * The request is unlinked directly through its own list pointers and
+ * the per-direction queue count is decremented.
+ *
+ * @lo the nbd device
+ * @req the request to be removed
+ *
+ * Returns -EINVAL if @req is NULL, otherwise 0.
+ */
+static int
+enbd_remove (struct enbd_device *lo, struct request *req)
+{
+    int cmd;
+
+    if (!req)
+        return -EINVAL;
+
+    list_del_init (&req->queuelist);
+
+    /* PTB accounting and nothing more */
+    cmd = rq_data_dir (req);
+    atomic_dec (&lo->countq[cmd]);
+
+    return 0;
+}
+
+/*
+ * PTB - Open the device. This is the blkops function.
+ *
+ * The minor number is split into the whole-device index (minor >>
+ * ENBD_SHIFT) and the partition number within it; the slot index is
+ * the partition number less one, so the whole-disk minor has no slot.
+ *
+ * On a whole-disk open the file and inode are recorded in the device
+ * and ENBD_INITIALISED is set. Every successful open bumps lo->refcnt.
+ * If the device is enabled, not being validated and has active slots,
+ * a media check is run when the device is not yet validated, or when
+ * the opener is not the slot's daemon and the last check is older
+ * than the request timeout (60s if none is set).
+ *
+ * @inode inode of the device node (taken from @file if NULL)
+ * @file the file being opened; may be NULL
+ *
+ * Returns 0, or -EINVAL without an inode, or -ENODEV if the minor is
+ * beyond MAX_NBD whole devices.
+ */
+int
+enbd_open (struct inode *inode, struct file *file)
+{
+    int dev;
+    struct enbd_device *lo;
+    int nbd;
+    int part;
+    int islot;
+    short do_media_check = 0;
+
+    if (!inode && file) {   /* added by ptb for 2.0.35. Necessary? */
+        inode = file->f_dentry->d_inode;
+    }
+    if (!inode) {
+        ENBD_ERROR ("null inode.\n");
+        return -EINVAL;
+    }
+
+    dev = iminor (inode);
+    nbd = dev >> ENBD_SHIFT;
+    part = dev - (nbd << ENBD_SHIFT);
+    islot = part - 1;
+
+    if (nbd >= MAX_NBD) {
+        ENBD_ERROR ("too many (%d) whole devices open\n", nbd);
+        return -ENODEV;
+    }
+
+    lo = &enbd_dev[nbd];
+
+    /* PTB check disabled by the leading 0 */
+    if (0 && (atomic_read(&lo->flags) & ENBD_REMOTE_INVALID)
+          && !(file->f_flags & O_NONBLOCK)) {
+        ENBD_ERROR ("exited NODEV because nd%s remote invalid\n",
+            lo->devnam);
+        return -ENODEV;
+    }
+
+    /* PTB provision for opening for direct i/o - gives mount aid */
+    if (file
+        && (atomic_read(&lo->flags) & ENBD_DIRECT)
+        && !(file->f_flags & O_DIRECT)) {
+        /* PTB we set NOFOLLOW to show we did it ! */
+        file->f_flags |= O_DIRECT | O_NOFOLLOW;
+    }
+
+    if (part == 0) {
+
+        if (!lo->file || lo->file != file) {
+            // PTB have got whole dev's inode for 1st time
+            lo->file = file;
+            atomic_set (&(&lo->wspeed)->frstj, jiffies);
+            atomic_set (&(&lo->rspeed)->frstj, jiffies);
+            atomic_set (&(&lo->tspeed)->frstj, jiffies);
+        }
+        if (!lo->inode || lo->inode != inode) {
+            lo->inode = inode;
+        }
+        /* PTB the old value of the bit is of no interest here */
+        atomic_test_and_set_mask (&lo->flags, ENBD_INITIALISED);
+    }
+
+    atomic_inc (&lo->refcnt);
+
+
+    // PTB FIXME - should this be here. If not validated, read
+    // parts.
+    if ((atomic_read (&lo->flags) & ENBD_ENABLED)
+        && ! (atomic_read (&lo->flags) & ENBD_VALIDATING)
+        && lo->aslot > 0) {
+
+        long timeout = lo->req_timeo > 0 ? lo->req_timeo * HZ : 60 * HZ;
+        /*
+         * The whole-disk minor (part 0) has no slot, so do not
+         * form a pointer to slots[-1] for it. The test below
+         * only looks at the slot when part != 0.
+         */
+        struct enbd_slot *slot = islot >= 0 ? &lo->slots[islot] : NULL;
+
+        if ( !(atomic_read (&lo->flags) & ENBD_VALIDATED)) {
+
+            /*
+             * PTB do we set VALIDATED here, or let the kernel call
+             * sequence result in it happening via our removable
+             * device routines? Let's go for the latter option.
+             */
+            do_media_check = 1;
+        }
+
+        if ((part == 0 || (slot->pid && slot->pid != current->pid))
+            && lo->last_checked + timeout < jiffies) {
+            /*
+             * PTB induce a remote check and maybe invalidate device.
+             * This currently requires enbd_ioctl for remote ioctl
+             * support so we don't bother to check the result of
+             * the call here. When we open the device, we check it.
+             */
+            do_media_check = 1;
+        }
+
+        if (do_media_check) {
+            // PTB this invalidates buffers, if necessary
+            ENBD_INFO ("DISK CHECK wanted in open on device nd%s\n",
+                lo->devnam);
+            lo->last_checked = jiffies;
+            check_disk_change(inode->i_bdev);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Finish off an ioctl request: copy its error count into the ioctl
+ * info hanging off req->special, detach the buffer, ioctl info and
+ * data from the request (without freeing any of them), drop the extra
+ * reference count, complete req->waiting and wake lo->req_wq.
+ *
+ * The request is not put here and the ioctl info is left for the
+ * waiter to deal with. A NULL @req is ignored.
+ *
+ * @req the ioctl request being completed
+ */
+static void
+enbd_end_ioctl_request(struct request *req) {
+
+    struct enbd_ioctl_info *ioctl_info;
+    struct enbd_device *lo;
+
+    if (!req)
+        return;
+
+    // PTB get rid of the request, saving data to ioctl info
+
+    // PTB don't get rid of the buffer, just null the reference
+    req->buffer = NULL;
+
+    ioctl_info = req->special;
+    // PTB transfer error value to info structure for safekeeping
+    if (!ioctl_info) {
+        ENBD_ERROR("received ioctl req %p without ioctl info\n", req);
+    } else {
+        ioctl_info->errors = req->errors;
+    }
+
+    // PTB don't get rid of ioctl_info either! Just null the reference
+    req->special = NULL;
+    req->data = NULL;
+    req->data_len = 0;
+    /*
+     * PTB decrement the count we incremented in make_req to hold
+     * it stable till after completion
+     */
+    req->ref_count--;
+
+    // PTB remove any ENBD stigmata
+    //req->flags &= (1 << __REQ_NR_BITS) - 1; // FIXME
+
+    lo = rq_get_enbd(req);  // get before complete to avoid race
+
+    // PTB notify waiters
+    complete(req->waiting);
+
+    // PTB tell ioctl_make_req that something has happened
+    if (lo) {
+        wake_up(&lo->req_wq);
+    } else {
+        ENBD_ALERT ("ioctl req %p with no lo received\n", req);
+    }
+
+    // PTB leave put_request to the waiter and leave ioctl_info on
+}
+
+/*
+ * PTB - complete a transaction irrefutably by taking it out of the
+ * - slot pending position it is in, and reporting end_request to kernel
+ *
+ * We are called without locks because our call to end request
+ * will take some sort of lock momentarily and we don't need
+ * locks because our request should already be off all queues.
+ *
+ * A request for which rq_nr_blks reports an error is logged and not
+ * touched. After the request has been ended and put, the slot and
+ * device counters are updated; for error-free writes the md5sum
+ * statistics are updated too and may switch ENBD_MD5SUM on or off
+ * according to the device thresholds.
+ *
+ * @slot the enbd_slot on which the req notionally was
+ * @req the poor defenceless kernel request about to be acked
+ */
+void
+enbd_commit (struct enbd_slot *slot, struct request *req)
+{
+
+    struct enbd_device *lo = slot->lo;
+    /* signed, so that the -ve error return of rq_nr_blks is seen */
+    long req_blks = rq_nr_blks (req);
+    int cmd;
+    int errors;
+
+    if (req_blks < 0) {
+        ENBD_ERROR ("corrupted req %p. Not touching with bargepole.\n",
+            req);
+        return;
+    }
+
+    list_del_init (&req->queuelist);
+
+    cmd = rq_data_dir (req);
+
+    switch (rq_type(req)) {
+    case IOCTL:
+        /* PTB just does completion, kfree contents in the rendezvous */
+        enbd_end_ioctl_request(req);
+        req->buffer = NULL;
+        req->special = NULL;
+        req->data = NULL;
+        req->data_len = 0;
+        break;
+    default:
+        enbd_end_request_lock (req);
+        break;
+    }
+    // PTB remove the ioctl markup
+    req->flags &= ~REQ_SPECIAL;
+    /*
+     * Sample the error count before we give up our reference: the
+     * request must not be looked at again once it has been put.
+     */
+    errors = req->errors;
+    blk_put_request (req);
+
+    slot->req_age = 0;
+
+    /* PTB only accounting and nothing more below here */
+    slot->req -= req_blks;
+    atomic_sub (req_blks, &lo->requests_req[cmd]);
+
+    /* PTB request with errors finishes accounting here */
+    if (errors != 0) {
+        /* PTB error exit */
+        atomic_add (req_blks, &lo->requests_err);
+        slot->err += req_blks;
+        return;
+    }
+
+    /* PTB good request accounting */
+    atomic_add (req_blks, &lo->requests_out[cmd]);
+    slot->out += req_blks;
+
+    if (cmd != WRITE)
+        /* PTB everything but a write was easy */
+        return;
+
+    /*
+     * PTB now non error case writes accounting
+     *
+     * account the 4 cases for a md5sum'd transaction
+     */
+
+    switch (atomic_read(&slot->flags) & (ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK)) {
+
+    case ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK:
+        atomic_add (req_blks, &lo->wrequests_5to);  // 11
+        atomic_add (req_blks, &lo->wrequests_5so);
+        // PTB zero the countdown to turning off md5 as it works
+        atomic_set (&lo->wrequests_5co, 0);
+        break;
+
+    case ENBD_SLOT_MD5SUM:
+        atomic_add (req_blks, &lo->wrequests_5to);  // 10
+        atomic_add (req_blks, &lo->wrequests_5wo);
+        atomic_inc (&lo->wrequests_5co);
+        if (!(atomic_read(&lo->flags) & ENBD_MD5SUM_NOAUTO) &&
+            (lo->md5_off_threshold >= 0 &&
+             atomic_read (&lo->wrequests_5co) > lo->md5_off_threshold)) {
+            atomic_set (&lo->wrequests_5co, 0);
+            // PTB turn off md5summing as it is not successful
+            atomic_clear_mask (ENBD_MD5SUM, &lo->flags);
+        }
+        break;
+
+    case ENBD_SLOT_MD5_OK:
+        atomic_add (req_blks, &lo->wrequests_5to);  // 01
+        atomic_add (req_blks, &lo->wrequests_5eo);
+        atomic_inc (&lo->wrequests_5co);
+        if (!(atomic_read(&lo->flags) & ENBD_MD5SUM_NOAUTO) &&
+            (lo->md5_off_threshold >= 0 &&
+             atomic_read (&lo->wrequests_5co) > lo->md5_off_threshold)) {
+            atomic_set (&lo->wrequests_5co, 0);
+            // PTB turn off md5summing as it is errored
+            atomic_clear_mask (ENBD_MD5SUM, &lo->flags);
+        }
+        break;
+
+    default:
+    case 0:
+        // PTB nobody asked for a md5 and nobody gave one back
+        atomic_inc (&lo->wrequests_5no);
+        if (!(atomic_read(&lo->flags) & ENBD_MD5SUM_NOAUTO) &&
+            (lo->md5_on_threshold >= 0 &&
+             atomic_read (&lo->wrequests_5no) > lo->md5_on_threshold)) {
+            atomic_set (&lo->wrequests_5no, 0);
+            // PTB turn on md5summing every so often
+            atomic_set_mask (ENBD_MD5SUM, &lo->flags);
+        }
+        break;
+    }
+
+    // PTB clear the md5sum indicators from the slot afterwards!
+    atomic_clear_mask((ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK), &slot->flags);
+
+    // PTB we ran out of difficult cases, so return
+}
+
+/*
+ * PTB - error out a transaction irrefutably by taking it out of the
+ * - slot pending position it is in, and reporting end_request to kernel
+ *
+ * We must be called without spinlocks held, as we take it in end req
+ *
+ * The request's error count is bumped (unless it is already -ve), the
+ * request is unlinked under the slot lock, ended and put, and the
+ * slot and device counters are adjusted. A NULL request is logged and
+ * ignored. A request whose block count cannot be determined is still
+ * errored out, but is accounted as zero blocks.
+ *
+ * @slot the enbd_slot on which the req notionally was
+ * @req the poor defenceless kernel request about to be errored
+ */
+void
+enbd_error (struct enbd_slot *slot, struct request *req)
+{
+    struct enbd_device *lo = slot->lo;
+    /* signed, so that the -ve error return of rq_nr_blks is seen */
+    long req_blks = rq_nr_blks (req);
+    int cmd;
+
+    if (req_blks < 0) {
+        ENBD_ERROR ("passed illegal request %p\n", req);
+        if (!req)
+            return;
+        /* keep the -ve error code out of the block counters */
+        req_blks = 0;
+    }
+
+    if (req->errors >= 0)
+        req->errors++;
+
+    // PTB take lock just long enough to get request off queue
+    spin_lock(&slot->lock);
+    list_del_init (&req->queuelist);
+    spin_unlock(&slot->lock);
+
+    ENBD_ALERT ("error out req %p from slot %d!\n", req, slot->i);
+
+    cmd = rq_data_dir (req);
+
+    switch (rq_type(req)) {
+    case IOCTL:
+        ENBD_ALERT("erroring and putting ioctl req %p\n", req);
+        enbd_end_ioctl_request(req);
+        // PTB that leaves the ioctl info intact. The waiter
+        // on the req completion will have to deal with it.
+        break;
+    default:
+        enbd_end_request_lock (req);
+        break;
+    }
+    // PTB remove any ioctl markup
+    req->flags &= ~REQ_SPECIAL;
+    blk_put_request (req);
+
+    /* PTB accounting and nothing more */
+
+    atomic_sub (req_blks, &lo->requests_req[cmd]);
+
+    slot->in -= req_blks;
+    slot->req -= req_blks;
+
+    slot->req_age = 0;
+    slot->err += req_blks;
+    atomic_add (req_blks, &lo->requests_err);
+}
+
+/*
+ * Take a request out of a slot. This must not hold the queuelock on
+ * entry as we take the queue lock in order to play with the devices
+ * queue.
+ *
+ * The request is errored out via enbd_error instead of being rolled
+ * back if the device has ENBD_SHOW_ERRS set, if the request carries a
+ * -ve (remote) error count, or if it is an ioctl request. Otherwise it
+ * is unlinked under the slot lock, the slot counters are wound back,
+ * and it is put back on the device queue at a position chosen by
+ * sequence number.
+ *
+ * @slot the nbd slot on which to work
+ * @req the request
+ */
+static void
+enbd_rollback (struct enbd_slot *slot, struct request *req)
+{
+
+    struct enbd_device *lo = slot->lo;
+    /* signed, so that the -ve error return of rq_nr_blks is seen */
+    long req_blks;
+    unsigned long flags;
+    int seqno;
+    struct list_head *pos;
+    struct request *xreq;
+
+    if ((atomic_read (&lo->flags) & ENBD_SHOW_ERRS)
+        || req->errors < 0      // remote error
+        || rq_type(req) == IOCTL) {
+        // PTB error instead of rollback (errors < 0 is remote error)
+        enbd_error (slot, req);
+        return;
+    }
+
+    req_blks = rq_nr_blks (req);
+
+    if (req_blks < 0) {
+        ENBD_ERROR ("malformed request %p\n", req);
+        return;
+    }
+
+    // PTB take lock just long enough to get request off queue
+    spin_lock(&slot->lock);
+    list_del_init (&req->queuelist);
+    spin_unlock(&slot->lock);
+
+    ENBD_ALERT ("(%d): rollback req %p!\n", slot->i, req);
+
+    // PTB accounting
+    slot->in -= req_blks;
+    slot->req -= req_blks;
+
+    seqno = rq_seqno(req);
+
+    /*
+     * Walk the queue backwards from its tail to the first entry with
+     * a larger sequence number (or all the way round to the list
+     * head) and link the request in just before that position.
+     * NOTE(review): whether "before" is the right side depends on
+     * which end the queue is consumed from - confirm against the
+     * dequeue code.
+     */
+    write_lock_irqsave(&lo->queue_lock, flags);
+    list_for_each_prev (pos, &lo->queue) {
+        xreq = list_entry (pos, struct request, queuelist);
+        if (rq_seqno(xreq) > seqno) {
+            break;
+        }
+    }
+    list_add_tail (&req->queuelist, pos);
+    write_unlock_irqrestore(&lo->queue_lock, flags);
+
+}
+
+/*
+ * PTB - undo transactions by taking them out of the slot pending
+ * - 
position and replacing them on the generic device queue
+ *     - NB we do not hold the io request lock or queue sem when
+ *     - calling this as we take it internally in enbd_rollback
+ *     - At most 1001 attempts are made, as a guard against a
+ *     - request that refuses to leave the slot queue.
+ *
+ * @slot the nbd slot to scan
+ */
+static void
+enbd_rollback_all (struct enbd_slot *slot)
+{
+
+	struct request *req;
+	short count = 0;
+
+	spin_lock(&slot->lock);
+	while (!list_empty (&slot->queue)) {
+
+		if (count++ > 1000)
+			break;
+
+		req = list_head (&slot->queue, struct request, queuelist);
+
+		if (!req)
+			break;
+
+		// PTB enbd_rollback takes the slot lock itself
+		spin_unlock(&slot->lock);
+		enbd_rollback (slot, req);
+		spin_lock(&slot->lock);
+	}
+	spin_unlock(&slot->lock);
+
+}
+
+/*
+ * PTB error out all the requests on a slot
+ *
+ * We must be called without the slot lock held, as enbd_error()
+ * takes it. At most 1001 attempts are made.
+ *
+ * @slot the nbd slot to scan
+ */
+static void
+enbd_error_all (struct enbd_slot *slot)
+{
+
+	struct request *req;
+	short count = 0;
+
+	spin_lock(&slot->lock);
+	while (!list_empty (&slot->queue)) {
+		if (count++ > 1000)
+			break;
+		req = list_head (&slot->queue, struct request, queuelist);
+		if (!req)
+			break;
+		// PTB enbd_error takes the slot lock itself
+		spin_unlock(&slot->lock);
+		enbd_error (slot, req);
+		spin_lock(&slot->lock);
+	}
+	spin_unlock(&slot->lock);
+}
+
+/*
+ * PTB - let a request onto the slot pending position
+ *     - Must be called without the slot lock held, as we take
+ *       slot->lock here to link the request onto the slot queue
+ *       and to update the slot counters. The device queue lock
+ *       is not taken here.
+ *
+ * @slot the nbd slot to let the request onto
+ * @req  the request to move onto the slot queue
+ */
+void
+enbd_accept (struct enbd_slot *slot, struct request *req)
+{
+
+	struct enbd_device *lo = slot->lo;
+	unsigned long req_blks = rq_nr_blks (req);
+	int cmd;
+
+	// PTB req_blks is unsigned, so test the sign explicitly
+	if ((long) req_blks < 0)
+		return;
+
+	/* PTB accounting and nothing more */
+	cmd = rq_data_dir (req);
+
+	atomic_add (req_blks, &lo->requests_req[cmd]);
+	/* PTB - Note that this really is slot and not lo.
+	 */
+	spin_lock(&slot->lock);
+	list_add (&req->queuelist, &slot->queue);
+
+	slot->req_age = jiffies;
+	slot->in += req_blks;
+	slot->req += req_blks;
+	spin_unlock(&slot->lock);
+}
+
+/*
+ * PTB - read from userspace to a request buffer. Do it piecewise
+ *     - to cope with clustered requests.
+ *     - return number of bytes read
+ *
+ *   Segments are filled in order from consecutive user addresses.
+ *   We stop at the first segment that cannot be copied in full, so
+ *   that later segments are never filled from the wrong offset.
+ *   The amount copied is governed by the request's segments; len
+ *   is only used to report a mismatch.
+ *
+ * @req  the request whose bio segments receive the data
+ * @user the userspace source address
+ * @len  the number of bytes the caller expects to be transferred
+ */
+static int
+copy_from_user_to_req (struct request *req, char *user, int len)
+{
+
+	unsigned size = 0;
+	struct bio *bio /* = req->bio */;
+
+	/* PTB assume user verified */
+
+	rq_for_each_bio(bio, req) {
+
+		int i;
+		struct bio_vec * bvl;
+
+		bio_for_each_segment(bvl, bio, i) {
+
+			struct page *page = bvl->bv_page;
+			int offset = bvl->bv_offset;
+			const unsigned current_size
+			    = bvl->bv_len;
+			char *buffer;
+			int rem;
+
+			buffer = page_address(page) + offset;
+
+			rem = copy_from_user (buffer, user + size, current_size);
+
+			size += current_size - rem;
+			// PTB short copy - leave both loops, not just this one
+			if (rem > 0)
+				goto out;
+		}
+	}
+ out:
+	if (size != (unsigned) len) {
+		ENBD_ALERT ("requested %d and only read %d bytes to req %p\n",
+			    len, size, req);
+		ENBD_ALERT ("request %p wanted to read user space buffer %p\n",
+			    req, user);
+	}
+	return size;
+}
+
+/*
+ * PTB - hand the data in buf over to the address recorded in the
+ *     - ioctl info, using the cp_to_user method of the registered
+ *     - remote ioctl module.
+ *     - return the size copied, or a size <= 0 recorded in the ioctl
+ *     - info unchanged, or -EINVAL if there is no remote ioctl module
+ *     - or no ioctl info, or -EFAULT if the target address is NULL or
+ *     - the copy comes up short.
+ *
+ * @ioctl_info the saved ioctl command, argument address and size
+ * @buf        the source buffer
+ */
+static int
+indirect_ioctl_store (struct enbd_ioctl_info *ioctl_info, char * buf)
+{
+	int err;
+	struct enbd_ioctl * remote_ioctl = enbd_remote_ioctl.remote;
+	char * arg;
+	long cmd;
+	int size;
+
+	if (!remote_ioctl)
+		return -EINVAL;
+
+	if (!ioctl_info)
+		return -EINVAL;
+	// PTB if we are reading, it should be to the local buffer
+	// PTB the buffer points at a kmalloced area
+
+	arg = (char *)ioctl_info->arg;
+	cmd = ioctl_info->cmd;
+	size = ioctl_info->size;
+
+	if (size <= 0)
+		return size;
+
+	if (!arg) {
+		ENBD_ERROR("cannot copy to user addr %p!\n", arg);
+		return -EFAULT;
+	}
+	err = remote_ioctl->cp_to_user (cmd, arg, buf, size);
+	if (err < size) {
+		ENBD_ERROR("failed (%d) copy to user of %d bytes\n", err, size);
+		return -EFAULT;
+	}
+	return size;
+}
+
+
+/*
+ * PTB - andres' kernel half of the user-space network handshake, used
+ *     - to complete a transaction.
+ *     - return 0 for success and -ve for fail.
+ *
+ *   The reply header is read from the slot's user buffer, the request
+ *   it names is looked up on the slot queue, any reply data is copied
+ *   into the request, and the request is then committed or errored.
+ *
+ *   Every exit taken after the client thread has been counted in
+ *   (lo->cthreads, ENBD_SLOT_RUNNING) counts it out again.
+ *
+ * @slot the nbd slot being acted on
+ *
+ */
+int
+enbd_ack (struct enbd_slot *slot)
+{
+	struct enbd_reply reply;
+	struct request *req, *xreq;
+	int result = 0;
+
+	void *user;
+	unsigned long req_blks = 1;
+	struct enbd_device *lo = slot->lo;
+	unsigned buflen = 0;
+	unsigned reqlen;
+	int cmd = -1;
+	struct list_head *pos;
+	int count = 0;
+
+	if (!(atomic_read(&slot->flags) & ENBD_SLOT_BUFFERED)) {
+		return -ENODEV;
+	}
+
+	atomic_inc (&lo->cthreads);
+	atomic_set_mask(ENBD_SLOT_RUNNING, &slot->flags);
+
+	spin_lock(&slot->lock);
+	slot->cli_age = jiffies;
+	user = slot->buffer;
+	spin_unlock(&slot->lock);
+
+	// PTB FIXME. Surely the req can rollback under us?
+	if (copy_from_user ((char *) &reply, (char *) user, sizeof (reply))) {
+		// PTB no req identified yet, so just count ourselves out
+		result = -EFAULT;
+		goto out;
+	}
+
+	if (reply.magic != ENBD_REPLY_MAGIC) {
+		/*
+		 * PTB we got a matching request, but it's corrupted. Best
+		 * throw the reply away and leave our request to age.
+		 */
+		atomic_inc(&slot->nerrs);
+		if (atomic_read(&slot->nerrs) <= 3) {
+			ENBD_ALERT ("Not enough reply magic for req\n");
+		}
+		/*
+		 * PTB returning -EAGAIN causes the client to pause 0.5s
+		 * and throw its reply away, then return to service. We leave
+		 * any request we have to age and be rolled back.
+		 */
+		result = -EAGAIN;
+		goto out;
+	}
+
+	// PTB we keep tracking the write position in the input buffer
+	buflen += ENBD_BUFFER_DATA_OFFSET;
+
+	// PTB save the reply handle (which is an address) as our req
+	memcpy (&req, &reply.handle, sizeof (req));
+	// PTB FIXME maybe the req can vanish while we copy?
+
+	xreq = NULL;
+	spin_lock(&slot->lock);
+	list_for_each (pos, &slot->queue) {
+		xreq = list_entry (pos, struct request, queuelist);
+		if (count++ > 1000)
+			break;
+		if (xreq == req)
+			/* PTB found it */
+			break;
+	}
+
+	// PTB have the slot queue lock
+
+	/*
+	 * PTB a NULL handle can never name a request. Without this test
+	 * a NULL handle would "match" the initial NULL xreq when the
+	 * slot queue is empty.
+	 */
+	if (!req || xreq != req) {
+
+		// PTB not found. Give up and return
+		spin_unlock(&slot->lock);
+
+		atomic_inc(&slot->nerrs);
+		if (atomic_read(&slot->nerrs) <= 3) {
+			ENBD_ALERT ("(%d): fatal: Bad handle %p != %p!\n",
+				    slot->i, req, xreq);
+		}
+
+		ENBD_ALERT("(%d): ignoring ack of req %p which slot lacks\n",
+			   slot->i, req);
+
+		/*
+		 * PTB we lie and say success because userspace got through to
+		 * us OK and the req they missed has been rolled back and will
+		 * be retransmitted by the kernel later and elsewhere
+		 */
+		result = 0;
+		goto out;
+	}
+
+	spin_unlock(&slot->lock);
+
+	/* PTB we leave the request on the slot here */
+
+	if (reply.error > 0) {
+
+		/* PTB wasn't error++'ed before */
+		int errors;
+
+		// PTB use slot lock to guard req since we're on it
+		spin_lock(&slot->lock);
+		errors = req->errors;
+		req->errors = -reply.error;
+		spin_unlock(&slot->lock);
+
+		atomic_inc(&slot->nerrs);
+		ENBD_ALERT ("error (%d) reply rcvd, err #%d, to req %p\n",
+			    reply.error, errors, req);
+
+		/* PTB we handle this - it's a remote error */
+		result = 0;
+		/* PTB transmit the error to the request as a low negative. */
+		ENBD_FAIL ("remote error on request");
+	}
+
+	if (reply.error < 0) {
+		/* PTB something wrong with the transmission system. Rollback */
+		int errors;
+
+		// PTB use slot lock to guard req since we're on it
+		spin_lock(&slot->lock);
+		if ((errors = req->errors) >= 0)
+			req->errors++;
+		spin_unlock(&slot->lock);
+
+		atomic_inc(&slot->nerrs);
+		if (atomic_read(&slot->nerrs) <= 3) {
+			ENBD_ALERT ("fail on reply err %d, req err %d for req %p\n",
+				    reply.error, errors, req);
+		}
+
+		result = -EINVAL;
+		ENBD_FAIL ("transmission error on request");
+	}
+
+	// PTB use slot lock to guard req since we're on it
+	spin_lock(&slot->lock);
+	req_blks = rq_nr_blks (req);
+	reqlen = req->nr_sectors;
+	reqlen <<= 9;
+	cmd = rq_type (req);
+	spin_unlock(&slot->lock);
+
+	switch (cmd) {
+
+		int size;
+		struct enbd_ioctl_info *ioctl_info;
+		long ioctl_cmd;
+
+	case READ:
+		/*
+		 * PTB We have to copy the buffer bit by bit in
+		 * case the request is clustered.
+		 */
+		size =
+		    copy_from_user_to_req (req, ((char *) user) + buflen, reqlen);
+		if (size < reqlen) {
+			ENBD_ALERT
+			    ("(%d): copy %dB from user to req %p failed (%d)\n",
+			     slot->i, reqlen, req, size);
+			// PTB we could try again? We should investigate.
+			result = -EBADF;
+
+			// PTB use slot lock to guard req since we're on it
+			spin_lock(&slot->lock);
+			if (req->errors >= 0) {
+				if (req_blks > 0)
+					req->errors += req_blks;
+				else
+					req->errors += 1;
+			}
+			spin_unlock(&slot->lock);
+
+			ENBD_FAIL ("exited because of bad copy from user");
+			// PTB FIXME - think we want to discard and retry
+		}
+
+		// PTB we keep tracking the write position in the buffer
+		buflen += size;
+		break;
+
+	case WRITE:
+		/*
+		 * PTB we want to know if the reply is md5summed, and if it is
+		 * whether the md5sum is the same as the one on the
+		 * request. But that's not something we can presently see
+		 * from here as we don't make an md5sum in the kernel.
+		 * So we have to rely on the reply flag from userspace.
+		 * We transmit the information to the slot, as we can't
+		 * keep it on the request.
+		 */
+
+		switch (reply.flags &
+			(ENBD_REPLY_MD5SUM | ENBD_REPLY_MD5_OK)) {
+
+		case ENBD_REPLY_MD5SUM | ENBD_REPLY_MD5_OK:
+			/*
+			 * PTB we asked for an md5sum comparison and
+			 * the two matched, so we skipped writing the request
+			 */
+			atomic_set_mask((ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK),
+					&slot->flags);	//11
+			break;
+		case ENBD_REPLY_MD5SUM:
+			// PTB the two differed, so we wrote the request
+			atomic_set_mask(ENBD_SLOT_MD5SUM, &slot->flags);
+			atomic_clear_mask(ENBD_SLOT_MD5_OK, &slot->flags); // 10
+			break;
+		case ENBD_REPLY_MD5_OK:
+			// PTB the server refused the md5 request
+			atomic_clear_mask(ENBD_SLOT_MD5SUM, &slot->flags);
+			atomic_set_mask(ENBD_SLOT_MD5_OK, &slot->flags); // 01
+			break;
+		default:
+		case 0:
+			// PTB nobody asked for an md5sum comparison
+			atomic_clear_mask((ENBD_SLOT_MD5SUM | ENBD_SLOT_MD5_OK),
+					  &slot->flags);	// 00
+			break;
+		}
+		// PTB now we are all set up to do the accounting in commit etc.
+		break;
+
+	case IOCTL:
+
+		// PTB the commit should emit the request notification
+
+		ioctl_info = req->special;
+
+		if (!ioctl_info) {
+			result = -EINVAL;
+			ENBD_FAIL ("corrupt remote ioctl rq without ctl data");
+		}
+		ioctl_cmd = ioctl_info->cmd;
+
+		if (!(reply.flags & ENBD_REPLY_IOCTL)) {
+			ENBD_ALERT ("ioctl (%#lx) reply to req %p missing ioctl flag\n",
+				    ioctl_cmd, req);
+		}
+
+
+		if (ioctl_cmd == -1l) {
+
+			// PTB use slot lock to guard req since we're on it
+			spin_lock(&slot->lock);
+			req->errors = result = -EINVAL;
+			spin_unlock(&slot->lock);
+
+			ENBD_FAIL ("unauthorized remote ioctl");
+		}
+
+		/*
+		 * PTB We saved ioctl size in req .. but only approximately,
+		 * as nr_sectors.
+		 */
+
+		/*
+		 * PTB if we are reading, it is from userspace to the local
+		 * kernel buffer arg
+		 */
+
+		// PTB we are treating a saved local address or direct val
+		if ((_IOC_DIR (ioctl_cmd) & _IOC_READ)) {
+
+			int size = ioctl_info->size;
+
+			if (size > 0) {
+				/*
+				 * PTB sectors is an overestimate. Should be
+				 * OK as we are reading from the client
+				 * buffer which has plenty of room to spare.
+				 */
+				char * buf = req->buffer;
+
+				if (copy_from_user (buf, (char *) user + buflen, size)) {
+					result = -EFAULT;
+					// PTB use slot lock to guard req
+					spin_lock(&slot->lock);
+					if (req->errors > 0)
+						req->errors++;
+					else
+						req->errors = -EFAULT;
+					spin_unlock(&slot->lock);
+					ENBD_FAIL ("remote ioctl failed store");
+				}
+				buflen += size;
+			}
+		}
+
+		break;
+	}			// PTB eswitch
+
+	atomic_set(&slot->nerrs, 0);
+	/*
+	 * PTB - completion (or erroring) of transaction.
+	 * note that enbd_commit will take a lock to do end_req
+	 */
+	enbd_commit (slot, req);
+	// PTB commit will not have put_req an IOCTL - we do it later
+	result = 0;
+	goto out;
+
+ error_out:
+	/* PTB we will next do a client rollback on the slot from userspace.
+	 * Right here we just skip the request.
+	 * But .. don't error the request. We might have rolled it
+	 * back and be referencing it.
+	 */
+	// PTB use slot lock to guard req since we're on it
+	spin_lock(&slot->lock);
+	if (req->errors < 0) {
+		spin_unlock(&slot->lock);
+		/* PTB remote error, remote connection is ok */
+		atomic_set(&slot->nerrs, 0);
+		enbd_error (slot, req);
+		result = 0;
+		goto out;
+	}
+	if (result != -EAGAIN && result != 0) {
+		if (req->errors >= 0)
+			req->errors += req_blks;
+		slot->err += req_blks;
+	}
+	spin_unlock(&slot->lock);
+
+	if (cmd == IOCTL) {
+		// PTB we need to finish a ioctl right here, we won't rollback
+		enbd_error(slot, req);
+		// PTB that ran the completion and alerted the waiter
+	}
+
+ out:
+	// PTB one client thread leaves
+	atomic_dec (&lo->cthreads);
+	atomic_clear_mask(ENBD_SLOT_RUNNING, &slot->flags);
+	return result;
+}
+
+/*
+ * PTB - write to userspace from a request buffer. Do it piecewise
+ *     - to cope with clustered requests.
+ *     - return number of bytes written
+ *
+ *   Segments are written in order to consecutive user addresses.
+ *   We stop at the first segment that cannot be copied in full, so
+ *   that later segments never land at the wrong user offset.
+ *   The amount copied is governed by the request's segments; len
+ *   is not consulted.
+ *
+ * @req  the request whose bio segments supply the data
+ * @user the userspace destination address
+ * @len  the number of bytes the caller expects to be transferred
+ */
+static int
+copy_to_user_from_req (struct request *req, char *user, int len)
+{
+
+	unsigned size = 0;
+	struct bio *bio /* = req->bio */;
+
+	/* PTB assume user verified */
+
+	rq_for_each_bio(bio, req) {
+
+		int i;
+		struct bio_vec * bvl;
+
+		bio_for_each_segment(bvl, bio, i) {
+
+			struct page *page = bvl->bv_page;
+			int offset = bvl->bv_offset;
+			const unsigned current_size
+			    = bvl->bv_len;
+			char *buffer;
+			int rem;
+
+			buffer = page_address(page) + offset;
+
+			rem = copy_to_user (user + size, buffer, current_size);
+
+			size += current_size - rem;
+			// PTB short copy - leave both loops, not just this one
+			if (rem > 0)
+				goto out;
+		}
+
+	}
+ out:
+	return size;
+}
+
+/*
+ * PTB do the devices three speed updates (write, read and total),
+ *     feeding each speed object the current requests_in counts
+ *
+ * @lo the nbd device to do the update on
+ */
+static void
+enbd_set_speed (struct enbd_device *lo)
+{
+	int r, w, t;
+	struct enbd_speed *wspd = &lo->wspeed;
+	struct enbd_speed *rspd = &lo->rspeed;
+	struct enbd_speed *tspd = &lo->tspeed;
+	w = atomic_read (&lo->requests_in[WRITE]);
+	wspd->update (wspd, w);
+	r = atomic_read (&lo->requests_in[READ]);
+	rspd->update (rspd, r);
+	t = w + r;
+	tspd->update (tspd, t);
+}
+
+
+
+/*
+ * PTB - andres' kernel half of the userspace networking. This part
+ *     - initiates the transaction by taking a request off the generic
+ *     - device queue and placing it in the slots pending position.
+ *     - I believe we return 0 for success and -ve for fail.
+ *     - we wait at most lo->req_timeo * HZ jiffies for a request
+ *
+ * @slot the nbd slot to act on.
+ */
+
+// PTB forward decl, needed by enbd_get_req. It lives at file scope
+// because a block scope function declaration may not be static.
+static int enbd_clr_sock (struct enbd_slot *slot);
+
+int
+enbd_get_req (struct enbd_slot *slot)
+{
+	struct enbd_request request;
+	struct request *req;
+	int result = 0;
+	static atomic_t count;
+	unsigned start_time = jiffies;
+	struct enbd_device *lo = slot->lo;
+	unsigned timeout = lo->req_timeo * HZ;
+	int islot = slot->i;
+	// PTB for the new timezone field in requests
+	extern struct timezone sys_tz;
+	struct timeval time;
+	unsigned long flags;
+	struct enbd_seqno * seqno_out = &lo->seqno_out;
+
+	atomic_inc (&lo->cthreads);	// PTB - client thread enters
+	atomic_set_mask(ENBD_SLOT_RUNNING, &slot->flags);
+	slot->cli_age = jiffies;
+
+	if (!(atomic_read(&slot->flags) & ENBD_SLOT_BUFFERED)) {
+		ENBD_FAIL ("Our slot has no buffer");
+	}
+
+	atomic_set (&lo->islot, islot);
+
+	spin_lock(&slot->lock);
+	if (!list_empty (&slot->queue)) {
+
+		__s64 sector;
+		unsigned len;
+		int type;
+		char * typename;
+
+		// PTB report the request that is stuck on the *slot* queue
+		req = list_tail (&slot->queue, struct request, queuelist);
+		sector = req->sector;
+		len = req->nr_sectors;
+		type = rq_type (req);
+		switch (type) {
+		case IOCTL:
+			typename = "ioctl";
+			break;
+		case READ:
+			typename = "read";
+			break;
+		case WRITE:
+			typename = "write";
+			break;
+		case MD5SUM:
+			typename = "md5sum";
+			break;
+		default:
+			typename = "unknown";
+			break;
+		}
+		spin_unlock(&slot->lock);
+		ENBD_ERROR("req %p type %s sectors %Ld-%Ld still on slot nd%s%d\n",
+			   req, typename, sector, sector + len - 1, lo->devnam, islot);
+		result = -EBUSY;
+		ENBD_FAIL ("impossible! already treating one request");
+		// PTB we do a nontrivial rollback from the user daemon
+	}
+	spin_unlock(&slot->lock);
+
+	if (!slot->file) {
+		result = -EBADF;
+		ENBD_FAIL ("Our slot has been nofiled");
+	}
+	if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) {
+		result = -ENODEV;
+		ENBD_FAIL ("Our slot has been vamooshed");
+	}
+
+	atomic_inc (&lo->cwaiters);
+	atomic_set_mask(ENBD_SLOT_WAITING, &slot->flags);
+
+	/* PTB take spinlock in order to examine queue
+	 * we need to protect ourselves against the request fn too
+	 */
+	read_lock_irqsave (&lo->queue_lock, flags);
+	atomic_dec (&lo->cwaiters);
+	atomic_clear_mask(ENBD_SLOT_WAITING, &slot->flags);
+
+	// PTB - now spin until request arrives to treat
+	while (slot->file && list_empty (&lo->queue)) {
+
+		int siz;
+		int time_left = start_time + timeout - jiffies;
+
+		read_unlock_irqrestore (&lo->queue_lock, flags);
+
+		// PTB one client thread goes to sleep
+		atomic_inc (&lo->cwaiters);
+		atomic_set_mask(ENBD_SLOT_WAITING, &slot->flags);
+
+		interruptible_sleep_on_timeout (&lo->wq, time_left);
+
+		atomic_clear_mask(ENBD_SLOT_WAITING, &slot->flags);
+		// PTB one client thread reactivates
+		atomic_dec (&lo->cwaiters);
+		atomic_inc (&count);
+
+		// PTB Have to take the spinlock again to check at the queue
+		atomic_inc (&lo->cwaiters);
+		atomic_set_mask(ENBD_SLOT_WAITING, &slot->flags);
+		// PTB we need to protect ourselves against the request fn too
+		read_lock_irqsave (&lo->queue_lock, flags);
+		atomic_dec (&lo->cwaiters);
+		atomic_clear_mask(ENBD_SLOT_WAITING, &slot->flags);
+
+		// PTB fail for recheck if we are inactive too long
+
+		time_left = start_time + timeout - jiffies;
+		if (time_left > 0 || !list_empty (&lo->queue))
+			continue;
+
+		// PTB bad. timeout with nothing on queue. Error out.
+		result = -ETIME;
+
+		// PTB we will exit with fail, so up spinlock now
+		read_unlock_irqrestore (&lo->queue_lock, flags);
+
+		siz = lo->blksize + sizeof (struct enbd_request);
+		// PTB verify the buffer is still OK - holds one block
+		if (access_ok(VERIFY_WRITE,slot->buffer,siz))
+			goto error_out;
+
+		// PTB buffer is invalid
+		result = -EINVAL;
+
+		// PTB clr_sock takes both the io lock and the spinlock
+		enbd_clr_sock (slot);
+		ENBD_FAIL ("Our process has died or lost its buffer");
+
+		/*
+		 * PTB we may do a rollback from the user daemon here
+		 * but it'll be trivial - without effect - as we don't
+		 * have a request in our slot to treat.
+		 */
+		goto error_out;
+
+	}			// end while loop
+
+	// PTB we still have the (read) spinlock here
+
+	if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) {
+		read_unlock_irqrestore (&lo->queue_lock, flags);
+		result = -ENODEV;
+		ENBD_FAIL ("Our slot vaporized while we slept!");
+	}
+	if (!slot->file) {
+		read_unlock_irqrestore (&lo->queue_lock, flags);
+		result = -EBADF;
+		ENBD_FAIL ("Our slot nofiled itself while we slept!");
+	}
+
+	spin_lock(&slot->lock);
+	if (!list_empty (&slot->queue)) {
+		spin_unlock(&slot->lock);
+		read_unlock_irqrestore (&lo->queue_lock, flags);
+		result = -EINVAL;
+		ENBD_FAIL ("impossible! already treating one request");
+		// PTB we do a nontrivial rollback from the user daemon
+	}
+	spin_unlock(&slot->lock);
+
+	// PTB now relinquish the read lock and try for the write lock
+	read_unlock_irqrestore (&lo->queue_lock, flags);
+
+	write_lock_irqsave (&lo->queue_lock, flags);
+	// PTB got the write lock
+
+	if (list_empty (&lo->queue)) {
+		write_unlock_irqrestore (&lo->queue_lock, flags);
+		// PTB - somebody else did it while we waited on spinlock. OK
+		result = -EINVAL;
+		ENBD_FAIL ("ho hum beaten to the punch");
+		// PTB we may do a trivial rollback from the user daemon
+	}
+
+	// PTB cli/sti here looks unnec. hardware interrupts return here
+	// AMARIN begin uninterruptible code
+
+	// PTB we have the (write) spinlock
+
+	// PTB oldest=last element in queue
+	req = list_tail (&lo->queue, struct request, queuelist);
+
+	// PTB this is where we free the req from our queue. We need to be
+	// holding our spinlock at this point
+
+	// PTB - must succeed as have the spinlock
+	result = enbd_remove (lo, req);
+	// PTB now holding irqs off in enbd_remove
+
+	// AMARIN end uninterruptable code
+	// PTB uh - maybe cli/sti is needed? interrupts can muck the queue?
+	//     - Nah! I have left them enabled so we can see any errors.
+
+	write_unlock_irqrestore (&lo->queue_lock, flags);
+
+	/*
+	 * NOTE(review): from here on the req is off the device queue but
+	 * not yet on the slot. The failure exits below that do not end
+	 * the req themselves appear to leave it on no queue - confirm.
+	 */
+
+	// PTB the whole struct goes to userspace, so leave nothing unset
+	memset (&request, 0, sizeof (request));
+	request.magic = ENBD_REQUEST_MAGIC;
+	request.flags = 0;
+	// PTB default, so that the second switch below sees a defined type
+	request.type = rq_type (req);
+
+	switch (rq_type (req)) {
+
+		unsigned long cmd;
+		unsigned long arg;
+		size_t size;
+		struct enbd_ioctl_info * ioctl_info;
+
+	case IOCTL:
+
+		request.type = IOCTL;
+
+		// PTB this is our special ioctl kernel request
+
+		ioctl_info = req->special;
+
+		if (!ioctl_info) {
+			result = -EINVAL;
+			ENBD_ALERT ("corrupt ioctl req %p with no ctl data",
+				    req);
+			ENBD_FAIL ("corrupt ioctl req with no ctl data");
+		}
+
+		cmd = ioctl_info->cmd;
+		size = ioctl_info->size;
+		arg = ioctl_info->arg;
+
+
+		request.len = 0;
+		// PTB we are in get_req, transferring stored ioctl
+		if ((_IOC_DIR (cmd) & _IOC_READ) && size > 0) {
+			// PTB if len is +ve we copy to the user buffer later
+			request.len = size;
+		}
+		// PTB we store the weirded ioctl id.
+		// PTB Yes, this composition is our private invention.
+		request.from = (((__u64) cmd) << 32)
+		    // PTB really want this to go to a 64 bit request.special
+		    | ((__u64) (unsigned long) arg);
+		break;
+
+	case READ:
+	case WRITE:
+
+		request.type = rq_data_dir (req);
+		request.from = req->sector;
+		request.from <<= 9;
+		request.len = req->nr_sectors;
+		request.len <<= 9;
+		if (atomic_read (&lo->flags) & ENBD_MD5SUM) {
+			// PTB set the please do md5sum flag on the request
+			request.flags |= ENBD_REQUEST_MD5SUM;
+		}
+		break;
+
+	case MD5SUM:
+		break;
+
+	default:
+		ENBD_ALERT ("received unknown req %p type %#x\n",
+			    req, rq_type (req));
+		break;
+	}
+
+	request.seqno = seqno_out->calc(seqno_out, rq_seqno (req));
+
+	/*
+	 * PTB we should here erase the extra seqno info in the request
+	 * so that on error or on ack the kernel can use the right internal
+	 * array, but I'll erase it in the ack function instead
+	 */
+
+	do_gettimeofday (&time);
+	request.time = time.tv_sec;
+	request.time *= 1000000;
+	request.time += time.tv_usec;
+	request.zone = sys_tz.tz_minuteswest;
+
+	// PTB tz_dsttime = 0 always in linux
+
+	memcpy (&request.handle, &req, sizeof (request.handle));
+
+	if (copy_to_user (slot->buffer, (char *) &request, sizeof (request))){
+		result = -EFAULT;
+		ENBD_FAIL ("Copy request to userspace buffer failed");
+	}
+
+	switch (request.type) {
+
+		int err;
+		char * arg;
+
+	case READ:
+		break;
+
+	case IOCTL:
+		if (request.len <= 0)
+			break;	// PTB presumably nothing to do
+		arg = (char *) slot->buffer + ENBD_BUFFER_DATA_OFFSET;
+		if (copy_to_user (arg, req->buffer, request.len)) {
+			result = -EFAULT;
+			ENBD_FAIL ("Copy request to userspace buffer failed");
+		}
+		break;
+
+	case WRITE:
+		arg = (char *) slot->buffer + ENBD_BUFFER_DATA_OFFSET;
+		err = copy_to_user_from_req (req, arg, request.len);
+		if (err >= request.len)
+			break;	// OK
+		// PTB buffer had missing BHSs
+		ENBD_ERROR ("req %p offered %d bytes of %d for copy to user\n",
+			    req, err, request.len);
+		// PTB this request is badly damaged. We had better shoot it.
+		if (req) {
+			if (req->errors >= 0)
+				req->errors++;
+			enbd_end_request_lock (req);
+			blk_put_request (req);
+		}
+		ENBD_FAIL ("kernel failed to keep req while we copied from it");
+		break;
+	case MD5SUM:
+		break;
+	default:
+		ENBD_ERROR ("req %p was type %#x\n", req, rq_type(req));
+		ENBD_FAIL ("unknown req type");
+		break;
+	}
+
+	/*
+	 * PTB enbd_accept takes only the slot lock. The req is already
+	 * free of the shared queue and only needs to be placed on the
+	 * unique slot queue.
+	 */
+
+	enbd_accept (slot, req);
+
+	atomic_dec (&lo->cthreads);	// PTB - client thread leaves normally
+	atomic_clear_mask(ENBD_SLOT_RUNNING, &slot->flags);
+
+	return 0;
+
+ error_out:
+	// PTB accounting - a fail to get a request is not an errored request
+	atomic_dec (&lo->cthreads);	// PTB - client thread leaves abnormally
+	atomic_clear_mask(ENBD_SLOT_RUNNING, &slot->flags);
+	// PTB never report success from the failure exit
+	result = result < 0 ? result : -ENODEV;
+
+	return result;
+}
+
+/*
+ * PTB error out the pending requests on the kernel queue
+ * We have to be called WITHOUT the io request lock held.
+ * We sleep in between clearing each request, for "safety".
+ * At most 1000 requests are cleared per call.
+ *
+ * @lo the nbd device to scan
+ *
+ * returns the number of requests actually removed
+ */
+static int
+enbd_clr_kernel_queue (struct enbd_device *lo)
+{
+
+	int count = 0;
+	unsigned long flags;
+	request_queue_t *q = lo->q;
+
+	spin_lock_irqsave (q->queue_lock, flags);
+
+	// PTB count is bumped only once a request really has been removed
+	while (count < 1000 && ! elv_queue_empty(q)) {
+		struct request *req;
+		req = elv_next_request(q);
+		if (!req) {	// PTB impossible
+			ENBD_ALERT
+			    ("impossible! kernel queue empty after tested nonemty!\n");
+			break;
+		}
+		blkdev_dequeue_request (req);
+		spin_unlock_irqrestore (q->queue_lock, flags);
+		if (req->errors >= 0)
+			req->errors++;
+		current->state = TASK_INTERRUPTIBLE;
+		schedule_timeout (1);
+		enbd_end_request_lock (req);
+		blk_put_request (req);
+		count++;
+		spin_lock_irqsave (q->queue_lock, flags);
+	}
+	spin_unlock_irqrestore (q->queue_lock, flags);
+
+	ENBD_ALERT ("removed %d requests\n", count);
+	return count;
+
+}
+
+/*
+ * PTB error out the pending requests on the nbd queue and kernel queue
+ * Note that we take the queue lock for this, with interrupts off as
+ * everywhere else the lock is taken, and drop it again before
+ * ending each request. At most 1000 requests are cleared per call.
+ *
+ * @lo the nbd device to scan
+ *
+ * returns the number of requests removed
+ */
+static int
+enbd_clr_queue (struct enbd_device *lo)
+{
+	int count = 0;
+
+	while (count < 1000) {
+
+		struct request *req;
+		unsigned long req_blks = 1;
+		unsigned long flags;
+		long cmd;
+
+		// PTB cannot allow new requests via interrupts
+		write_lock_irqsave (&lo->queue_lock, flags);
+		if (list_empty (&lo->queue)) {
+			write_unlock_irqrestore (&lo->queue_lock, flags);
+			break;
+		}
+		req = list_head (&lo->queue, struct request, queuelist);
+		if (!req) {
+			write_unlock_irqrestore (&lo->queue_lock, flags);
+			break;
+		}
+
+		req_blks = rq_nr_blks (req);
+
+		if (req->errors >= 0)
+			req->errors += req_blks + 1;
+		atomic_add (req_blks, &lo->requests_err);
+
+		/* PTB - must succeed as have the spinlock */
+		enbd_remove (lo, req);
+		write_unlock_irqrestore (&lo->queue_lock, flags);
+		count++;
+
+		// PTB note the type before the request is ended
+		cmd = rq_type(req);
+
+		enbd_end_request_lock (req);
+
+		switch(cmd) {
+		case IOCTL:
+			// PTB leave it to the ioctl waiter to free it
+			ENBD_ALERT("clearing ioctl req %p\n", req);
+			// PTB the commit or error put the request
+			break;
+		default:
+			blk_put_request (req);
+			break;
+		}
+
+	}
+	ENBD_ALERT ("unqueued %d reqs\n", count);
+	return count;
+}
+
+
+
+#ifndef NO_BUFFERED_WRITES
+ /*
+ * Magic function from rd.c that we hope saves a buffer head
+ * permanently somewhere in the kernel VM system.
+ *
+ * Copies the data of one buffer head into the page cache pages of
+ * the device inode, creating (and zero filling) pages that are not
+ * yet cached, and marks each page written dirty.
+ *
+ * @sbh the buffer head holding the data to be saved
+ * @nbd the index of the device in enbd_dev[]
+ *
+ * returns 0 on success, -ENODEV if the device has no inode recorded,
+ * -ENOMEM if a page cache page could not be obtained.
+ */
+static int
+buffered_write_pagecache_IO (struct buffer_head *sbh, int nbd)
+{
+	struct address_space *mapping;
+	unsigned long index;
+	int offset, size, err;
+	struct enbd_device *lo = &enbd_dev[nbd];
+	err = 0;
+
+	// PTB we need to save the /dev/nda inode
+	if (!lo->inode) {
+		err = -ENODEV;
+		goto out;
+	}
+	mapping = lo->inode->i_mapping;
+
+	// PTB index appears to be the page number
+	index = sbh->b_rsector >> (PAGE_CACHE_SHIFT - 9);
+	// PTB offset is in bytes, and says where in the page the sector starts
+	offset = (sbh->b_rsector << 9) & ~PAGE_CACHE_MASK;
+	// PTB well, an abbreviation for the buffer size, in bytes
+	size = sbh->b_size;
+
+	do {
+		// PTB we mark each page that we should write to Uptodate
+
+		int count;
+		struct page **hash;
+		struct page *page;
+		char *src, *dst;
+
+		int unlock = 0;
+
+		// PTB ummm, how much of the page is left to traverse
+		count = PAGE_CACHE_SIZE - offset;
+		// PTB reduce it to how much we actually need to traverse
+		if (count > size)
+			count = size;
+		// PTB say NOW? that we have traversed what we want of the page
+		size -= count;
+
+		hash = page_hash (mapping, index);
+		page = __find_get_page (mapping, index, hash);
+
+		if (!page) {
+			// PTB we get to make a new page
+			page = grab_cache_page (mapping, index);
+			if (!page) {
+				// PTB failed to get new page
+				err = -ENOMEM;
+				goto out;
+			}
+			// PTB magic
+			if (!Page_Uptodate (page)) {
+				memset (kmap (page), 0, PAGE_CACHE_SIZE);
+				kunmap (page);
+				SetPageUptodate (page);
+			}
+			// PTB the new page is locked. We need to unlock it later
+			unlock = 1;
+		}
+
+		// PTB prepare already for next page
+		index++;
+
+		// PTB set up for copy
+		dst = kmap (page);
+		dst += offset;
+		src = bh_kmap (sbh);
+
+		// PTB prepare for next round
+		offset = 0;
+
+		// PTB do a copy
+		memcpy (dst, src, count);
+
+		kunmap (page);
+		bh_kunmap (sbh);
+
+		if (unlock) {
+			UnlockPage (page);
+		}
+		SetPageDirty (page);
+		__free_page (page);
+
+	} while (size > 0);
+
+ out:
+	return err;
+
+}
+
+/*
+ * PTB go through the buffer heads of a write request, copy each into
+ *     the buffer cache block for the same device and sector, mark
+ *     that block protected, and save the data in the page cache of
+ *     the device inode too.
+ *
+ * @req the write request to be buffered
+ *
+ * returns 0 on success, or the first negative error returned by
+ * buffered_write_pagecache_IO().
+ */
+static int
+buffered_write (struct request *req)
+{
+
+	struct buffer_head *bh;
+	int dev = MINOR (req->rq_dev);
+	int nbd = dev >> ENBD_SHIFT;
+	int err = 0;
+
+	// PTB go through and copy and protect the written buffers
+	for (bh = req->bh; bh; bh = bh->b_reqnext) {
+		struct buffer_head *rbh;
+		rbh =
+		    getblk (bh->b_rdev, bh->b_rsector / (bh->b_size >> 9),
+			    bh->b_size);
+		if (bh != rbh) {
+			char *bdata = bh_kmap (bh);
+			memcpy (rbh->b_data, bdata, rbh->b_size);
+			ENBD_ALERT ("got new bh sector %lu on write\n",
+				    bh->b_rsector);
+			// PTB unmap only what we mapped just above
+			bh_kunmap (bh);
+		}
+		mark_buffer_protected (rbh);	// PTB equals dirty, uptodate
+		err = buffered_write_pagecache_IO (bh, nbd);
+		// PTB drop the getblk reference on the error exit too
+		brelse (rbh);
+		if (err < 0) {
+			break;
+		}
+	}
+	return err;
+}
+
+#endif		/* NO_BUFFERED_WRITES */
+
+/*
+ * PTB check if the device is read only according to int flags
+ *
+ * @lo the nbd device to be checked
+ *
+ * returns 1 if ENBD_READ_ONLY is set in the device flags, else 0
+ */
+static int
+enbd_read_only(struct enbd_device *lo) {
+	return (atomic_read(&lo->flags) & ENBD_READ_ONLY) != 0;
+}
+/*
+ * PTB set the device readonly (or not), both in the device flags
+ *     and on its disk
+ *
+ * @lo the nbd device to be set up
+ * @ro 1 for read only, 0 for read write.
+ */
+static void
+enbd_set_read_only(struct enbd_device * lo, int ro) {
+
+	if (ro != 0) {
+		atomic_set_mask (ENBD_READ_ONLY, &lo->flags);
+	} else {
+		atomic_clear_mask (ENBD_READ_ONLY, &lo->flags);
+	}
+
+	// PTB which device really does not matter. We do the checking.
+	set_disk_ro (lo->disk, ro != 0);
+}
+
+#undef ENBD_FAIL
+#define ENBD_FAIL( s...
) { \ + if (fails++ <= 0) { \ + ENBD_ERROR( s); printk("\n"); \ + }\ + goto error_out; \ +} + +/* + * PTB - kernel function to take reqs off the kernel queue. Runs with + * io lock held. This is the "request function". + */ +static void +do_enbd_request (request_queue_t * q) +{ + struct request *req; + unsigned long flags; + + while ((req = elv_next_request(q)) != NULL) { + + struct enbd_device *lo = rq_get_enbd(req); + static int fails; + + blkdev_dequeue_request (req); + + if (!lo) { + if (!req->rq_disk) { + ENBD_FAIL ("request with NULL disk field received!"); + } + ENBD_FAIL ("request with NULL private data field in disk received!"); + } + + if (!(req->flags & REQ_CMD)) { + ENBD_FAIL ("request without CMD flag received!"); + } + + /* PTB - one kernel thread enters */ + atomic_inc (&lo->kthreads); + + if (atomic_read (&lo->kthreads) > atomic_read (&lo->kmax)) + atomic_set (&lo->kmax, atomic_read (&lo->kthreads)); + + if (!lo->inode || !lo->file) { + ENBD_FAIL ("Request when device not ready."); + } + + if (rq_data_dir (req) == WRITE && enbd_read_only(lo)) { + ENBD_FAIL ("write on read-only device"); + } + flags = atomic_read (&lo->flags); + if (!(flags & ENBD_INITIALISED)) { + ENBD_FAIL ("device not initialised."); + } + if (!(flags & ENBD_ENABLED)) { + ENBD_FAIL ("device not enabled."); + } + if (flags & ENBD_REMOTE_INVALID) { + ENBD_FAIL ("remote device invalid."); + } + if (req->sector + req->nr_sectors > lo->sectors) { + ENBD_FAIL ("overrange request"); + } + if (req->sector < 0) { + ENBD_FAIL ("underrange request"); + } + if (req->rq_disk->major != major) { + ENBD_FAIL ("request for wrong major"); + } + req->errors = 0; + + // PTB in 2.5 we can release the iolock briefly here + spin_unlock_irq(q->queue_lock); + + // PTB we are the only reader and writer of lo->seqno + if (rq_type (req) == WRITE && rq_seqno (req) == 0) { + // PTB it is a new request never seen before + struct enbd_seqno * seqno_out = &lo->seqno_out; + seqno_out->inc(seqno_out); + /* + * PTB 
only WRITE requests, and they are NOT SPECIALS + */ + rq_set_seqno (req, seqno_out->get(seqno_out)); + } + + // PTB normal sequence is to queue request locally + enbd_enqueue (lo, req); + goto accounting; + + accounting: + atomic_dec (&lo->kthreads); + // PTB regain the iolock for another turn + spin_lock_irq(q->queue_lock); + fails = 0; // PTB good request. Restart counter + continue; // PTB next request + + error_out: + // PTB can rely on req being nonnull here + if (req->errors >=0) + req->errors++; + + enbd_end_request (req); + + // PTB in 2.5 we can release the iolock briefly here + spin_unlock_irq(q->queue_lock); + + blk_put_request (req); + + // PTB more accounting + if (lo) { + int req_blks = rq_nr_blks (req); + atomic_add (req_blks, &lo->requests_err); + atomic_dec (&lo->kthreads); + } else { + ENBD_ALERT("failed to account orphan errored req %p\n", + req); + } + // PTB regain the iolock for another turn + spin_lock_irq(q->queue_lock); + continue; + } + return; +} + +#ifndef HOT_ADD_DISK + #define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#endif +#ifndef SET_DISK_FAULTY + #define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) +#endif + +/* + * PTB check all parts for inclusion in a MD device and notify the MD + * device we're OK (or not) again when appropriate. 
+ */ +static int +enbd_notify_md_devices (struct enbd_device *lo, int cmd) +{ + int j; + struct enbd_md *md = &enbd_md; + + for (j = 0; j - 1 < lo->nslot; j++) { + dev_t enbd_dev = MKDEV (major, j + (lo->nbd << ENBD_SHIFT)); + if (j == 0) { + if (atomic_read (&lo->md_count) <= 0) + continue; + } else { + struct enbd_slot *slot = &lo->slots[j - 1]; + if (slot->md_count <= 0) + continue; + } + md->notify(&enbd_md, enbd_dev, cmd); + } + return 0; +} + + +/* + * PTB - set the enabled flag on a device (call without the spinlock held) + * + * @lo the nbd device being treated + */ +static void +enbd_enable (struct enbd_device *lo) { + unsigned long flags; + int did_enabled = 0; + + // PTB reenable part + write_lock_irqsave (&lo->meta_lock, flags); + if (!atomic_test_and_set_mask (&lo->flags, ENBD_ENABLED)) { + // PTB was not enabled before + atomic_clear_mask (ENBD_VALIDATED, &lo->flags); + lo->lives++; + did_enabled = 1; + } + write_unlock_irqrestore (&lo->meta_lock, flags); + + if (did_enabled) { + ENBD_ALERT("set VALID on nd%s\n", lo->devnam); + //__invalidate_device(lo->inode->i_bdev, 0); + enbd_notify_md_devices(lo, HOT_ADD_DISK); + } +} + + +/* + * PTB rollback all requests on a given slot and then invalidate it + * (so the requests can't go back until somebody reactivates the slot) + * At least rollback (which we call takes both the io spinlock and our + * spinlock, so we can hold neither when we are called. Soft_reset + * (which we call) also calls rollback, so has the same problem. 
+ * + * @slot the nbd slot being treated + */ +static int +enbd_clr_sock (struct enbd_slot *slot) +{ + int i = 0; + struct enbd_device *lo = slot->lo; + int islot = slot->i; + unsigned long flags; + int do_reset = 0; + int do_enable = 0; + static int enbd_soft_reset (struct enbd_device*); + + enbd_rollback_all (slot); + + slot->file = NULL; + slot->bufsiz = 0; + atomic_set(&slot->flags, 0); + slot->buffer = NULL; + + write_lock_irqsave (&lo->meta_lock, flags); + + /* PTB reset lo->aslot */ + + if (lo->aslot > 0) { + + /* PTB grr .. do this the hard way */ + int aslot = 0; + for (i = 0; i < lo->nslot; i++) { + struct enbd_slot *sloti = &lo->slots[i]; + if (sloti->file) + aslot++; + } + lo->aslot = aslot; + + if (lo->aslot <= 0) { + // PTB we were the last client alive, diasable device + if (atomic_read (&lo->flags) & ENBD_SHOW_ERRS) { + // PTB soft_reset will invalidate_buffers + atomic_clear_mask (ENBD_ENABLED, &lo->flags); + do_reset = 1; + } + } else if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) { + // PTB must not call reenable as that clears the queue + do_enable = 1; + } + + } + + // PTB lift the lock temporarily + write_unlock_irqrestore(&lo->meta_lock, flags); + if (do_reset) { + enbd_soft_reset (lo); + } + if (do_enable) { + enbd_enable (lo); + ENBD_ALERT ("enabled device nd%s\n", lo->devnam); + } + write_lock_irqsave(&lo->meta_lock, flags); + + /* PTB reset lo->islot, for no good reason */ + + if (atomic_read (&lo->islot) == islot) { + for (i = 0; i++ < lo->nslot;) { + atomic_inc (&lo->islot); + if (atomic_read (&lo->islot) >= lo->nslot) + atomic_set (&lo->islot, 0); + if (lo->slots[atomic_read (&lo->islot)].file) + break; + } + } + lo->harderror = 0; + write_unlock_irqrestore (&lo->meta_lock, flags); + + /* PTB don't clear whole device queue as we might still be open */ + + return 0; +} + +/* + * PTB - check all slots for old requests and roll them back. 
+ * At least rollback (which we call takes both the io spinlock and our + * spinlock, so we can hold neither when we are called. + * + * @lo the nbd device to scan + */ +static void +enbd_rollback_old (struct enbd_device *lo) +{ + + int islot; + + for (islot = 0; islot < lo->nslot; islot++) { + struct enbd_slot *slot = &lo->slots[islot]; + if (slot->req_age > 0 + && slot->req_age < jiffies - lo->req_timeo * HZ) { + enbd_rollback_all (slot); + } + } + +} + +/* + * PTB - get a semaphore within a certain number of jiffies. + * Return 0 for success, otherwise -ETIME; + */ +static int +down_timeout(struct semaphore *sem, typeof(jiffies) timeout) { + + typeof(jiffies) start_jiffies = jiffies; + while (down_trylock(sem) != 0) { + if (jiffies > start_jiffies + timeout) { + return -ETIME; + } + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + return 0; +} + + +/* + * PTB - register a socket to a slot. + * - Return 0 for success and -ve for failure. + * Nowadays this doesn't do very much! Just finalizes things. + * + * @slot the nbd slot being registered + */ +static int +enbd_set_sock (struct enbd_slot *slot, int arg) +{ + + struct enbd_device *lo = slot->lo; + int islot = slot->i; + unsigned long flags; + int do_enable = 0; + + if (!(atomic_read (&lo->flags) & ENBD_INITIALISED)) { + ENBD_ALERT ("(%d) device nd%s not initialised yet!\n", + islot, lo->devnam); + return -ENODEV; + } + if (!(atomic_read (&lo->flags) & ENBD_SIZED)) { + ENBD_ALERT ("(%d) device nd%s not sized yet!\n", islot, + lo->devnam); + return -EINVAL; + } + if (!(atomic_read (&lo->flags) & ENBD_BLKSIZED)) { + ENBD_ALERT ("(%d) device nd%s not blksized yet!\n", islot, + lo->devnam); + return -EINVAL; + } + if (!(atomic_read (&lo->flags) & ENBD_SIGNED)) { + ENBD_ALERT ("(%d) setting unsigned device nd%s! 
But harmless.\n", + islot, lo->devnam); + return -EINVAL; + } + + if (down_timeout(&lo->pid_sem, HZ) < 0) { + ENBD_ALERT + ("(%d) pid %d took too long to get nd%s%d pid sem!\n", + islot, slot->pid, lo->devnam, islot + 1); + return -ETIME; + } + + if (slot->pid != current->pid) { + if (jiffies > slot->cli_age + 2 * HZ * lo->req_timeo) { + ENBD_ALERT + ("(%d) dead client process %d has nd%s%d, erasing pid!\n", + islot, slot->pid, lo->devnam, islot + 1); + slot->pid = 0; + } else { + ENBD_ALERT + ("(%d) other live client process %d has nd%s%d!\n", + islot, slot->pid, lo->devnam, islot + 1); + } + up (&lo->pid_sem); + return -EINVAL; + } + up (&lo->pid_sem); + + slot = &lo->slots[islot]; + + // PTB this is a queue critical code region for the flags business + write_lock_irqsave (&lo->meta_lock, flags); + + // PTB file has to be nonzero to indicate we are all set up. + slot->file = (void *) (unsigned long) (arg+1 > 0 ? arg+1 : 1); + + if (islot >= lo->nslot) { + lo->nslot = islot + 1; + ENBD_INFO ("increased socket count to %d\n", lo->nslot); + } + + lo->harderror = 0; + + if (lo->disk && !get_capacity(lo->disk)) { + set_capacity(lo->disk, lo->sectors); + } + if (++lo->aslot > 0) { + do_enable = 1; + } + // PTB end of queue critical region + write_unlock_irqrestore (&lo->meta_lock, flags); + + /* + * PTB if this is the first slot, we might call reenable and + * thus clr queue too, but reenable takes the spinlock + */ + if (do_enable) + enbd_enable(lo); + + return 0; +} + +/* + * PTB - return the index i of 2^i + j, 0 <= j < 2^i + */ +static inline unsigned +mylog2 (unsigned arg) +{ + unsigned log = 0; + while ((arg >>= 1) > 0) + log++; + return log; +} + +/* + * PTB - set the blksize in bytes of the block device. Return 0 for + * - success and -ve for failure. 
+ */ +static int +enbd_set_blksize (struct enbd_device *lo, unsigned int arg) +{ + int nbd = lo->nbd; + if (arg > PAGE_SIZE || arg < 512 || (arg & (arg - 1))) { + ENBD_ERROR ("blksize too big (%u)\n", arg); + return -EINVAL; + } + lo->blksize = enbd_blksizes[nbd << ENBD_SHIFT] = arg; + lo->logblksize = mylog2 (lo->blksize); + set_blocksize(lo->inode->i_bdev, lo->blksize); + atomic_set_mask (ENBD_BLKSIZED, &lo->flags); + return 0; +} + +/* + * PTB - set the size in bytes of the block device. Return 0 for + * - success and -ve for failure. + */ +static int +enbd_set_size (struct enbd_device *lo, __u64 arg) +{ + int nbd = lo->nbd; + lo->bytesize = enbd_bytesizes[nbd << ENBD_SHIFT] = arg; + lo->size = enbd_sizes[nbd << ENBD_SHIFT] = arg >> 10; + lo->sectors = lo->size << 1; + if (lo->inode && lo->inode->i_bdev && lo->inode->i_bdev->bd_inode) + lo->inode->i_bdev->bd_inode->i_size = arg; + if (lo->disk) + set_capacity (lo->disk, arg >> 9); + atomic_set_mask (ENBD_SIZED, &lo->flags); + return 0; +} + +/* WG */ +static int +enbd_set_intvl (struct enbd_device *lo, int arg) +{ + if (arg <= 0) { + ENBD_ERROR ("bad pulse interval/req timeout value (%d)\n", arg); + return -EINVAL; + } + lo->req_timeo = arg; + return 0; +} + +static int +enbd_set_spid (struct enbd_slot *slot, int arg) +{ + short spid = arg; + if (arg < 0 || arg >= (1 << (sizeof (short) * 8))) { + ENBD_ERROR ("bad spid value (%d)\n", arg); + return -EINVAL; + } + slot->spid = spid; + return 0; +} + +static int +enbd_set_bufferwr (struct enbd_device *lo, int arg) +{ + if (arg) { + atomic_set_mask (ENBD_BUFFERWR, &lo->flags); + } else { + atomic_clear_mask (ENBD_BUFFERWR, &lo->flags); + } + return 0; +} + +static int +enbd_set_remote_invalid (struct enbd_device *lo, int arg) +{ + /* + * PTB we handle the nvalidate event ourself exactly when it happens + * instead of letting the kernel have check_media defined + * and doing it there (and then reporting 0 to the kernel) + */ + unsigned long flags; + int do_invalidate 
= 0; + + if (arg == 0) { + write_lock_irqsave (&lo->meta_lock, flags); + if (atomic_test_and_clear_mask(&lo->flags, ENBD_REMOTE_INVALID)){ + if (atomic_test_and_clear_mask (&lo->flags, + ENBD_SET_SHOW_ERRS)) { + atomic_clear_mask (ENBD_SHOW_ERRS, &lo->flags); + } + } + write_unlock_irqrestore (&lo->meta_lock, flags); + return 0; + } + + write_lock_irqsave (&lo->meta_lock, flags); + if (!(atomic_test_and_set_mask (&lo->flags, ENBD_REMOTE_INVALID))) { + /* + * PTB this tells the kernel that next open + * should cause recheck .. we'll agree not to + * say we're happy until VALID is set again + */ + atomic_clear_mask (ENBD_VALIDATED, &lo->flags); + // PTB also make errors real until device validated again + if (!atomic_test_and_set_mask (&lo->flags, ENBD_SHOW_ERRS)) { + atomic_set_mask (ENBD_SET_SHOW_ERRS, &lo->flags); + } + // PTB test removing partitions + do_invalidate = 1; + } + write_unlock_irqrestore (&lo->meta_lock, flags); + + if (do_invalidate) { + // PTB destroy buffers + //struct inode *inode = lo->inode; + //struct block_device *bdev = inode->i_bdev; + + ENBD_ALERT("INVALIDATE drive on nd%s\n", lo->devnam); + // PTB - clear buffers now instead of waiting for kernel + //invalidate_bdev(bdev, 1); + + // PTB will be done by check_disk_changed whoch has + // called us! + //__invalidate_device(bdev, 0); + + // PTB will cause requests to start being errored + //invalidate_partition(lo->disk, 0); + } + + return 0; +} + + +/* + * Set the signature on the whole device, if not yet set. + * + * If it is set, compare. + */ +static int +enbd_set_sig(struct enbd_device *lo, int* buf) { + + if (!atomic_test_and_set_mask (&lo->flags, ENBD_SIGNED)) { + /* PTB first time grab sig */ + memcpy ((char *) lo->signature, buf, ENBD_SIGLEN); + return 1; + } + + /* PTB test for equality */ + if (memcmp(buf, (char*)&lo->signature[0], ENBD_SIGLEN) != 0) { + return -EINVAL; + } + return 0; +} + +/* + * Return the first slot index free when asking for n new ones. 
+ * If there s no such gap, then ENBD_MAXCONN will be returned. + * The return is always in the same argument address. + */ +static int +enbd_get_nport (struct enbd_device *lo, int *arg) +{ + int nslot, i; + + if (arg == NULL) { + return -EINVAL; + } + + nslot = *arg; + if (copy_from_user ((char *) &nslot, arg, sizeof (nslot))) { + ENBD_ERROR("could not copy number of ports from user\n"); + return -EFAULT; + } + + for (i = 0; i < ENBD_MAXCONN; i++) { + struct enbd_slot *sloti = &lo->slots[i]; + int j; + if (sloti->file) { + continue; + } + + for (j = i; j < ENBD_MAXCONN && j < i + nslot; j++) { + if (sloti->file) + break; + } + if (j == i + nslot) { + + break; + } + } + + if (copy_to_user (arg, (char *) &i, sizeof (int))) { + return -EFAULT; + } + return 0; +} + +/* + * Set the pid on the slot if unset, and set time, and then return 0. + * + * If the pid is already set, compare and if it's ours, return 0. + * If the pid differs return -ve. If they hasn't touched us in a long + * while (the time is set on many ops), erase it too! + */ +static int +enbd_set_pid (struct enbd_slot *slot) +{ + struct enbd_device *lo = slot->lo; + int islot = slot->i; + long timeout = lo->req_timeo > 0 ? 2 * lo->req_timeo * HZ: 60 * HZ; + + // PTB set pid if first time + if (slot->pid == 0) { + slot->pid = current->pid; + slot->cli_age = jiffies; + return 0; + } + + if (slot->pid == current->pid) + return 0; + + if (jiffies > slot->cli_age + timeout) { + ENBD_ALERT + ("(%d): dead process %d was setting sig, erasing pid\n", + islot, slot->pid); + slot->pid = current->pid; + slot->cli_age = jiffies; + return 0; + } + + ENBD_ALERT ("(%d): live process %d is trying to set sig\n", + islot, slot->pid); + return -EINVAL; +} + +/* + * PTB - if we're not signed, accept new sig and return success. + * - if we are signed, compare the offer and return success if equal, + * - and -ve for failure. 
+ * + * @slot the slot we're working on + * @sig the string of signature chars (accessed as int *) + */ +static int +my_nbd_set_sig (struct enbd_slot *slot, int *sig) +{ + int err = 0; + int buf[ENBD_SIGLEN / sizeof (int)]; + int islot = slot->i; + struct enbd_device *lo = slot->lo; + + if (!access_ok (VERIFY_READ, (char *) sig, ENBD_SIGLEN)) { + ENBD_ALERT ("(%d): failed sigcheck with bad user address %p\n", + islot, sig); + err = -EINVAL; + return err; + } + + // PTB semaphore not spinlock because copy_from_user might sleep? + if (down_timeout(&lo->pid_sem, HZ) < 0) { + ENBD_ALERT + ("(%d) pid %d took too long to get nd%s%d pid sem!\n", + islot, slot->pid, lo->devnam, islot + 1); + return -ETIME; + } + + if (enbd_set_pid(slot) < 0) { + up(&lo->pid_sem); + return -EINVAL; + } + // PTB slot->pid is now set and matches ours + + err = copy_from_user ((char *) buf, (char *) &sig[0], ENBD_SIGLEN); + if (err > 0) { + up(&lo->pid_sem); + ENBD_ERROR("could not copy signature from user\n"); + return -EFAULT; + } + err = enbd_set_sig(lo, (int *)buf); + + if (err < 0) { + // PTB vamoosh our pid registration too + slot->pid = 0; + ENBD_ALERT ("(%d): failed sigcheck wth %d\n", islot, err); + } + up (&lo->pid_sem); + return err; +} + +/* + * PTB - register a userspace buffer to a slot. Return 0 for success + * - and -ve for failure. Null arg acts as erase. 
+ */ +static int +enbd_reg_buf (struct enbd_slot *slot, char *buffer) +{ + + int err = 0, siz; + struct enbd_device *lo = slot->lo; + + if (!buffer) { + atomic_clear_mask(ENBD_SLOT_BUFFERED, &slot->flags); + slot->buffer = NULL; + slot->bufsiz = 0; + return 0; + } + + siz = lo->max_sectors << 9; + + /* verify the buffer is in the process space */ + if (!access_ok (VERIFY_WRITE, buffer, siz)) { + err = -EINVAL; + return err; + } + /* PTB hope the buffer is as big as it should be - FIXME */ + slot->buffer = buffer; + slot->bufsiz = siz; + + /* PTB let the device bufsiz be min of registered nonzero bufsizes */ + if (!lo->bufsiz) { + // PTB first time + lo->bufsiz = siz; + } else { + if (lo->bufsiz > siz) + lo->bufsiz = siz; + } + + // PTB just in case the buffer really is small, we reset all the + // kernels request maxima if we have to adjust the device max + if (lo->max_sectors < (lo->bufsiz >> 9)) { + int j; + lo->max_sectors = lo->bufsiz >> 9; + for (j = 0; j < ENBD_MAXCONN; j++) { + enbd_max_sectors[(lo->nbd << ENBD_SHIFT) + j] = + lo->max_sectors; + } + } + + atomic_set_mask(ENBD_SLOT_BUFFERED, &slot->flags); + return 0; +} + +/* + * PTB - this unsets the enabled flag on the device and then clears the + * - queue for the device.. Call without spinlock. 
+ * + * @lo the nbd device to scan + */ +static int +enbd_disable (struct enbd_device *lo) +{ + if (!lo || !(atomic_read (&lo->flags) & ENBD_INITIALISED)) { + ENBD_ALERT("enbd_disable called on bad device\n"); + return 0; + } + + if (atomic_test_and_clear_mask (&lo->flags, ENBD_ENABLED)) { + ENBD_ALERT ("disabled device nd%s\n", lo->devnam); + } + + enbd_notify_md_devices(lo, SET_DISK_FAULTY); + + // PTB have to recheck partitions on next open + if (atomic_test_and_clear_mask (&lo->flags, ENBD_VALIDATED)) { + ENBD_ALERT ("INVALIDATEd device nd%s\n", lo->devnam); + //__invalidate_device(lo->inode->i_bdev, 0); + } + return 0; +} + + +/* + * PTB - reset the enabled flag on a device and then clear all queues + * ( call without the spinlock held ) and then enable again. + */ +static void +enbd_reenable (struct enbd_device *lo) +{ + + int m, n; + + if (!(atomic_read (&lo->flags) & ENBD_INITIALISED)) + return; + if (lo->aslot <= 0) + return; + if ((atomic_read (&lo->flags) & ENBD_ENABLED)) + return; + + m = enbd_clr_queue (lo); + // PTB - have to call clr_kernel_queue without the io_spinlock held + n = enbd_clr_kernel_queue (lo); + + enbd_enable(lo); +} + +/* + * This function launches a thread which wakes for a signal to reenable + * the device, and then sets the timer to deleiver the signal. + */ +static int +enbd_reenable_delay (struct enbd_device *lo, int delay) +{ + write_lock (&lo->meta_lock); + if (lo->reenable_time == 0) + lo->reenable_time = jiffies + delay * HZ; + write_unlock (&lo->meta_lock); + return 0; +} + + + +/* + * PTB - drains device queue. Disables device. + * At least rollback (which we call takes both the io spinlock and our + * spinlock, so we can hold neither when we are called. Also + * invalidate buffers, on request of Rogier Wolff. 
+ */ +static int +enbd_soft_reset (struct enbd_device *lo) +{ + int j; + const int max_clrq_retries = 100; + if (!(atomic_read (&lo->flags) & ENBD_INITIALISED) || lo->nslot <= 0) { + return -EINVAL; + } + /* + * PTB We push back the requests in the slot, in order to be able to + * vamoosh them in a moment. This is a race, surely? We ought to + * do this atomically or dsiable the slots first. + */ + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slot = &lo->slots[j]; + enbd_rollback_all (slot); + } + // PTB disable unsets the nabled flag and clears the queue + enbd_disable (lo); + for (j = 0; j < max_clrq_retries; j++) { + int m = enbd_clr_queue (lo); + if (m <= 0) + break; + } + // PTB this would unsign the device: lo->flags &= ~ENBD_SIGNED; + + /* + * PTB put back invalidate buffers for use when called from + * clr_sock from enbd_release on request of Rogier Wolff. FIXME. + */ + ENBD_ALERT("INVALIDATE DEVICE nd%s\n", lo->devnam); + __invalidate_device(lo->inode->i_bdev, 0); + + if (1) { + struct inode *inode = lo->inode; + struct block_device *bdev = inode ? inode->i_bdev : NULL; + request_queue_t * q; + if (!bdev) + return 0; + q = bdev_get_queue (bdev); + ENBD_ALERT("run enbd_request on nd%s\n", lo->devnam); + spin_lock_irq(q->queue_lock); + do_enbd_request (q); + spin_unlock_irq(q->queue_lock); + } + + return 0; +} + +/* + * PTB - added a device/module reset for tidyness in face of rampant hacking + * - this does a soft_reset of all devices, followed bu a clr sock + * - on each, and then clears the kernel queue. It unsets the + * - enabled flag on each device. 
+ *       We have to be called without either the spinlock or the
+ *       spinlock held, as we call soft_reset which takes both, as
+ *       does clr_sock
+ */
+/*
+ * @lo not used: every device in enbd_dev[] that has a file and an
+ *     inode and is initialised gets reset
+ *
+ * Always returns 0.
+ */
+int
+enbd_hard_reset (struct enbd_device *lo)
+{
+	int i;
+	int err = 0;
+
+	for (i = 0; i < MAX_NBD; i++) {
+		// PTB named apart from the (unused) parameter
+		struct enbd_device *dev = &enbd_dev[i];
+		int j;
+		if (!dev->file || !dev->inode)
+			continue;
+		if (!(atomic_read(&dev->flags)&ENBD_INITIALISED))
+			continue;
+		enbd_soft_reset (dev);
+		for (j = 0; j < dev->nslot; j++) {
+			struct enbd_slot *slot = &dev->slots[j];
+			// PTB this takes the io spinlock and our spinlock.
+			enbd_clr_sock (slot);
+		}
+		// PTB - call clr_kernel_queue without the io_spinlock held
+		enbd_clr_kernel_queue (dev);
+	}
+
+	return err;
+}
+
+/*
+ * Calls kmalloc to get a buffer for the ioctl argument and, for
+ * ioctls that write, reads the user space @ioctl_info->arg into it,
+ * setting @ioctl_info->size.
+ *
+ * @ioctl_info the ioctl being set up (cmd and arg already filled in)
+ * @buf        where to hand back the new buffer (NULL if size is 0)
+ *
+ * If we error, we kfree the buffer and return < 0, and *buf is left
+ * as it was.
+ *
+ * If we succeed, we return the size >= 0
+ */
+static int
+indirect_ioctl_load (struct enbd_ioctl_info *ioctl_info, char **buf)
+{
+
+	/*
+	 * PTB a buffer is kmalloc'ed if and only if the return
+	 *     value is > 0. We return the exact size, not the
+	 *     value rounded up to 512 that is actually allocated.
+	 */
+
+	char * userbuf;
+	int size;
+	int err;
+	long cmd;
+	struct enbd_ioctl *remote_ioctl = enbd_remote_ioctl.remote;
+	int nr_sectors;
+	char * buffer;
+
+	// PTB no remote ioctl support registered
+	if (!remote_ioctl)
+		return -EINVAL;
+
+	if (!ioctl_info)
+		return -EINVAL;
+
+	cmd = ioctl_info->cmd;
+	userbuf = (char *)ioctl_info->arg;
+	/*
+	 * PTB buf is only needed for special interps that need to
+	 * trace the data. Normally the cmd info is enough
+	 */
+	size = remote_ioctl->size_user (cmd, userbuf);
+
+	if (size < 0) {
+		// PTB unauthorized ioctl
+		return -EINVAL;
+	}
+
+	if (size == 0) {
+		// PTB we never use the nbd devices small buffer now
+		ioctl_info->size = 0;
+		if (buf)
+			*buf = NULL;
+		return size;
+	}
+
+	/*
+	 * PTB we have to use an extra buffer or else block
+	 * here and rendezvous directly with the get_req call
+	 */
+	nr_sectors = (size + 511) >> 9;
+	buffer = kmalloc(nr_sectors << 9, GFP_KERNEL);
+
+	if (!buffer) {
+		return -ENOMEM;
+	}
+
+	if (_IOC_DIR (cmd) & _IOC_WRITE) {
+		err =
+		    remote_ioctl->cp_from_user (cmd, buffer, userbuf, size);
+		if (err < 0) {
+			kfree (buffer);
+			return err;
+		}
+	}
+	ioctl_info->size = size;
+	if (buf)
+		*buf = buffer;
+	else
+		kfree (buffer);	// PTB nobody to hand it to: do not leak
+	return size;
+}
+
+
+/*
+ * Gets a request from the device's queue, dresses it up as an IOCTL
+ * request carrying a freshly kmalloc'ed enbd_ioctl_info for @cmd and
+ * @arg, and puts it on the device's own queue.
+ *
+ * @lo  the nbd device
+ * @cmd the ioctl number
+ * @arg the ioctl argument (a user address if the ioctl is indirect)
+ *
+ * Returns the enqueued request, or NULL if no request could be had
+ * within the device request timeout, if memory ran out, or if the
+ * argument could not be loaded.
+ */
+static struct request *
+ioctl_make_request (struct enbd_device *lo, int cmd, unsigned long arg)
+{
+
+	struct request *req;
+	struct enbd_ioctl_info *ioctl_info;
+	unsigned long start_time = jiffies;
+	unsigned long timeout = lo->req_timeo * HZ;
+
+	while (req = blk_get_request(lo->q, WRITE, GFP_ATOMIC), !req) {
+		// PTB wrap-safe form of "jiffies >= start_time + timeout"
+		if (time_after_eq (jiffies, start_time + timeout)) {
+			// PTB it takes too long
+			ENBD_ALERT
+			    ("took too long to get a spare ioctl req: TIMEOUT\n");
+			return NULL;
+		}
+		interruptible_sleep_on_timeout (&lo->req_wq,
+					start_time + timeout - jiffies);
+	}
+
+	set_rq_type(req, IOCTL);
+
+	req->errors = 0;
+	// PTB the zeroing is done by get_request in ll_rw_lk, but still ..
+	req->bio = req->biotail = NULL;
+	req->data = NULL;
+	req->data_len = 0;
+	req->timeout = (start_time + timeout - jiffies);
+	if (!req->timeout)
+		req->timeout = 60 * HZ;
+
+	// PTB failsafe
+	req->nr_sectors = 0;
+	req->current_nr_sectors = 0;
+	req->hard_cur_sectors = 0;
+	req->buffer = NULL;
+
+	req->flags |= REQ_NOMERGE;
+
+	ioctl_info = kmalloc(sizeof(*ioctl_info), GFP_KERNEL);
+
+	if (!ioctl_info) {
+		blk_put_request(req);
+		return NULL;
+	}
+
+	ioctl_info->cmd = cmd;
+	ioctl_info->arg = arg;
+
+	req->special = (void *) ioctl_info;
+
+	/*
+	 * PTB this is (arg if it is direct, else) the address of a local buffer
+	 * PTB we need to store the arg or its dereference somewhere local
+	 * for a while until the cnb-client thread can enter and pick it
+	 * up. The alternative is to block the ioctl here until it is
+	 * picked up, which IS possible.
+	 */
+
+	if (_IOC_DIR (cmd) & _IOC_READ) {
+		// PTB indirect - makes req->buffer
+		int err = indirect_ioctl_load (ioctl_info, &req->buffer);
+		if (err < 0) {
+			// PTB we don't have to kill the buffer - already done
+			req->special = NULL;
+			req->data = NULL;
+			req->data_len = 0;
+			req->buffer = NULL;
+			req->nr_sectors = 0;
+			req->bio = req->biotail = NULL;
+			blk_put_request(req);
+			kfree(ioctl_info);
+			return NULL;
+		}
+		req->nr_sectors = (ioctl_info->size + 511) >> 9;
+
+	} else {
+		// PTB direct - we just need to remember the value
+		req->buffer = (char *) arg;	// use the value directly - ecch.
+		ioctl_info->size = 0;
+		req->nr_sectors = 0;
+	}
+	// PTB failsafe
+	req->current_nr_sectors = req->hard_cur_sectors
+	    = req->nr_sectors;
+
+	/* PTB point the request buffer vaguely in the direction of where
+	 * the data is, but it does not matter.
+	 */
+	req->rq_disk = lo->disk;
+
+	// PTB we queue the request for treatment and wait till treated
+	init_completion(&ioctl_info->x);
+	req->waiting = &ioctl_info->x;
+
+	/* PTB from scsi_ioctl.c
+	 * we need an extra reference to the request, so we can look at
+	 * it after io completion
+	 */
+	req->ref_count++;
+
+	enbd_enqueue (lo, req);
+
+	return req;
+}
+
+/*
+ * Return nonnegative (timeleft) if done within timeout, otherwise it's not done
+ * (!x->done) and the return will be negative.
+ *
+ * The sleep is interruptible; one "done" count is consumed on success.
+ *
+ * @x       the completion to wait on
+ * @timeout how long to wait at most, in jiffies
+ */
+static long
+enbd_wait_for_completion_timeout (struct completion *x, long timeout)
+{
+	spin_lock_irq (&x->wait.lock);
+	if (!x->done && timeout > 0) {
+		DECLARE_WAITQUEUE (wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail (&x->wait, &wait);
+		do {
+			__set_current_state (TASK_INTERRUPTIBLE);
+			spin_unlock_irq (&x->wait.lock);
+			timeout = schedule_timeout (timeout);
+			spin_lock_irq (&x->wait.lock);
+		} while (!x->done && timeout > 0);
+		__remove_wait_queue (&x->wait, &wait);
+	}
+	if (x->done) {
+		x->done--;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		// PTB not done: make sure the answer is negative
+		if (timeout >= 0)
+			timeout = -1;
+	}
+	spin_unlock_irq (&x->wait.lock);
+	return timeout;
+}
+
+/*
+ * Wait for an enqueued ioctl request to be completed.
+ *
+ * @req     the ioctl request (its special field is the ioctl info)
+ * @timeout how long to wait at most, in jiffies
+ *
+ * Returns what enbd_wait_for_completion_timeout returns (negative if
+ * not completed in time), or -EBADR if the request carries no ioctl
+ * info. Nothing is removed from any queue here; that is left to the
+ * caller.
+ */
+static int
+enbd_wait_on_ioctl_timeout(struct request *req, unsigned long timeout) {
+
+	struct enbd_ioctl_info *ioctl_info = req->special;
+	int err;
+
+	if (!ioctl_info) {
+		ENBD_ERROR("received ioctl req %p without ioctl info\n", req);
+		return -EBADR;
+	}
+
+	err = enbd_wait_for_completion_timeout(&ioctl_info->x, timeout);
+	// PTB now copy the request buffer to user space and then free it.
+	// (we didn't kill the ioctl_info structure).
+	if (err < 0) {
+		ENBD_ERROR ("remote ioctl timed out (%d remaining)\n", err);
+		//ENBD_ERROR ("remote ioctl errored remotely (%d)\n", err);
+		// PTB if we timeout then we assume responsibility for
+		// ending the request
+	}
+	return err;
+}
+
+
+
+/*
+ * PTB this routine enqueues a new request that points to an ioctl data
+ * struct and then waits for it to be signalled completed. The
+ * completion is done in a special end_request (called from commit or
+ * error) and the request is there blk_put immediately afterwards (we
+ * blk_get it here), so the put's and get's will balance. The ioctl
+ * data struct itself is kfree'd here.
+ *
+ * Here we (usually) load the ioctl data struct with info copied
+ * from userspace prior to enqueueing it. It should get written back
+ * again just prior to the special end_request, elsewhere.
+ *
+ * @lo  the nbd device
+ * @cmd the ioctl number
+ * @arg the ioctl argument
+ *
+ * Returns 0 on success, -EIO if no request could be made, -ETIME if
+ * the request timed out and was taken back off the device queue,
+ * -EINVAL if it timed out and could not be found there, and otherwise
+ * the (nonpositive) error recorded in the ioctl info, or -EINVAL if
+ * that is positive.
+ */
+static int
+do_enbd_remote_ioctl(struct enbd_device *lo, int cmd, unsigned long arg) {
+
+	unsigned long timeout;
+	int err;
+	struct request * req;
+	struct enbd_ioctl_info *ioctl_info;
+	char * buffer;
+
+	/*
+	 * PTB here we have to treat remote ioctls. We should probably make
+	 * a request and put it on the local queue, but where can we get
+	 * the request from? We might have to keep one in reserve.
+	 * That's not a bad idea, because
+	 * we generate it here and we delete it here, and the daemon code
+	 * is all set up to read that sort of thing. So that's what we do ...
+	 */
+
+	timeout = lo->req_timeo * HZ;
+
+	/*
+	 * PTB this does the enqueue, and also loads the buffer and
+	 *     nr_sectors values in the request, and sets up the
+	 *     ioctl_info struct as the data of the request
+	 */
+	req = ioctl_make_request(lo, cmd, arg);
+	if (!req) {
+		ENBD_ERROR("could not make a remote ioctl (%#x) request\n", cmd);
+		return -EIO;
+	}
+
+	// PTB if the req was made OK, it got a info struct attached
+	ioctl_info = req->special;
+
+	// PTB save the request buffer here & don't kill it in end request
+	buffer = req->buffer;
+
+	// PTB this waits for the request to be done
+	err = enbd_wait_on_ioctl_timeout(req, timeout);
+
+	// PTB deal with timeout
+	if (err < 0) {
+		// PTB we timed out waiting for an answer
+
+/*
+ * PTB race between timeout and ack. Look for the request and lock the
+ * queue we find it on. We only look on the device queue here as it's
+ * likely to be there. If it's on the slots, then rollback will catch
+ * it.
+ */
+		struct request * xreq = NULL;
+		struct list_head *pos;
+
+		write_lock(&lo->queue_lock);
+		list_for_each (pos, &lo->queue) {
+			xreq = list_entry (pos, struct request, queuelist);
+			if (xreq == req)
+				break;
+		}
+		if (xreq != req) {
+			// PTB not found - it's on a slot or already finished
+			ENBD_ERROR ("timed out remote ioctl %p lost\n", req);
+			// PTB leave for rollback/error
+			write_unlock(&lo->queue_lock);
+			return -EINVAL;
+		}
+		ENBD_ALERT ("remote ioctl %p found on dev queue\n", req);
+		// PTB we have the dev queue lock
+		enbd_remove (lo, req);
+		write_unlock(&lo->queue_lock);
+
+		//PTB give the completion - we were waiting for it, but we
+		// woke up anyway on timeout!
+		enbd_end_ioctl_request(req);
+
+		// PTB the buffer is ours to free only if the ioctl was
+		// indirect and a buffer really was allocated. The info
+		// struct has to be consulted BEFORE it is freed.
+		if (_IOC_DIR (ioctl_info->cmd) & _IOC_READ) {
+
+			if (ioctl_info->size > 0) {
+
+				req->buffer = NULL;
+				kfree(buffer);
+				// PTB removed and errored now.
+			}
+		}
+
+		// PTB last of all the info struct; leave no stale
+		// pointer to it in the request
+		req->special = NULL;
+		kfree(ioctl_info);
+
+		ENBD_ALERT ("timed out remote ioctl %p cleaned up\n", req);
+		blk_put_request(req);
+		return -ETIME;
+	}
+
+	// PTB no time out - we are the rendezvous. Do the remaining
+	// transfer using the ioctl info. The req has been vamooshed.
+
+	if (_IOC_DIR (ioctl_info->cmd) & _IOC_READ) {
+		// PTB need to know if the size was zero because then
+		// the buffer is not a valid address
+		if (ioctl_info->size > 0) {
+			/*
+			 * PTB sectors is an overestimate. Should be
+			 * OK as we are reading from the client
+			 * buffer which has plenty of room to spare.
+			 */
+			err = indirect_ioctl_store(ioctl_info, buffer);
+			if (err < 0) {
+				ENBD_ERROR ("remote ioctl failed store\n");
+			}
+			kfree(buffer);
+		}
+	}
+	err = ioctl_info->errors <= 0 ? ioctl_info->errors : -EINVAL;
+
+	// PTB Apparently put_req is done in the commit or error that released
+	// the wait_on for rendezvous. All we have left is the ioctl info.
+
+	if (err < 0) {
+		ENBD_ERROR("errored (%d) remote ioctl (%#x) %p\n",
+		  err, ioctl_info->cmd, req);
+	}
+	kfree(ioctl_info);
+
+	return err;
+}
+
+/*
+ * Look through all ENBD_MAXCONN slots of the device for the one
+ * registered to @pid.
+ *
+ * Returns the index of the first such slot, or -1 if there is none.
+ */
+static int
+find_slot (struct enbd_device *lo, int pid)
+{
+	int i;
+	// go search
+	for (i = 0; i < ENBD_MAXCONN; i++) {
+		struct enbd_slot * slot = &lo->slots[i];
+		if (slot->pid == pid)
+			break;
+	}
+	if (i < ENBD_MAXCONN)
+		return i;	// found it
+	// not found
+	return -1;
+}
+
+static int
+fixup_slot (struct enbd_device *lo, unsigned int cmd, unsigned long *arg)
+{
+	int intval, islot;
+
+	switch (cmd) {
+
+		// PTB get slot info from parameter if not given
+	case ENBD_CLEAR_SOCK:
+	case MY_NBD_CLR_REQ:
+	case MY_NBD_ERR_REQ:
+		// see if we match a known slot pid
+		if (arg && *arg == 0) {
+			islot = find_slot (lo, current->pid);
+			if (islot >= 0)
+				return islot;
+		}
+		ENBD_ALERT
+		    ("failed to find slot for pid %d for ioctl %x arg %lx\n",
+		     current->pid, cmd, arg ?
*arg : 0); + return islot = -1; + break; + + // PTB get the slot from the 16 high bits + case ENBD_SET_SOCK: + case MY_NBD_SET_SPID: + if (!arg) + return islot = -1; + intval = *arg >> ((sizeof (int) - sizeof (short)) * 8); + intval &= (1 << (sizeof (short) * 8)) - 1; + if (intval == 0) { + // no clue in the pid high bits. Search + islot = find_slot (lo, current->pid); + if (islot >= 0) { + // PTB change arg !! + *arg &= (1 << (sizeof (short) * 8)) - 1; + return islot; // found it + } + // not found + } + ENBD_ALERT + ("failed to find slot for pid %d for ioctl %x arg %lx\n", + current->pid, cmd, arg ? *arg : 0); + return islot = -1; + break; + + case MY_NBD_GET_REQ: + case MY_NBD_ACK: + islot = find_slot (lo, current->pid); + if (islot >= 0) + return islot; + ENBD_ALERT + ("failed to find slot for pid %d for ioctl %x arg %lx\n", + current->pid, cmd, arg ? *arg : 0); + return islot; + break; + + case MY_NBD_REG_BUF: + case MY_NBD_SET_SIG: + islot = find_slot (lo, current->pid); + if (islot >= 0) + return islot; + /* + * PTB Otherwise they passed a buffer + * and the slot number is in the first 4B + * We need some magic here for safety! + * set sig is the only call that really needs + * to send its pid! + */ + + if (!arg || !*arg) { + ENBD_ALERT + ("no arg supplied in call for pid %d ioctl %x arg %lx\n", + current->pid, cmd, arg?*arg:0); + return islot = -1; + } + intval = -1; + if (get_user (intval, (int *) *arg)) { + ENBD_ALERT + ("failed to read indirect user arg supplied in call for pid %d ioctl %x arg %lx\n", + current->pid, cmd, *arg); + return islot = -1; + } + if (intval <= 0 || intval > ENBD_MAXCONN) { + ENBD_ALERT + ("failed to find slot for pid %d ioctl %x arg %lx (%x)\n", + current->pid, cmd, *arg, intval); + return islot = -1; + } + islot = intval - 1; + + // PTB check for following magic after the sig + if (get_user (intval, + (int *)(ENBD_SIGLEN + (char *)(1 + (int *) *arg))) + || intval != ENBD_DEV_MAGIC) { + return islot = -1; + } + + // PTB Success. 
CHANGE ARG !!!! Skip slot designator + *arg += sizeof (int); + return islot; + + // PTB can't be reached - ends case + break; + } + + return islot = -1; +} + +/* + * PTB - generic ioctl handling + */ +static int +enbd_ioctl (struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct enbd_device *lo + = NULL; // PTB device pointer + int minor = -1; // PTB minor on which we got the ioctl + int islot = -1; // PTB slot number 0, 1, ... + int nbd = -1; // PTB the count for the device group + struct enbd_slot *slot + = NULL; // PTB slot pointer + int err; + + if (!capable(CAP_SYS_ADMIN)) { + ENBD_ERROR ("caller must be root.\n"); + return -EPERM; + } + if (!inode) { + ENBD_ERROR ("given bad inode.\n"); + return -EINVAL; + } + if (imajor (inode) != major) { + ENBD_ERROR ("pseudo-major %d != %d\n", + imajor (inode), major); + return -ENODEV; + } + minor = iminor (inode); + nbd = minor >> ENBD_SHIFT; + if (nbd >= MAX_NBD) { + ENBD_ERROR ("tried to open too many devices, %d\n", minor); + return -ENODEV; + } + lo = &enbd_dev[nbd]; + lo->harderror = 0; + islot = minor % ENBD_MAXCONN - 1; + + /* + * PTB fixup breakage >= 2.5.44 caused by not being allowed to talk to + * minors. We deduce the slot number from hints in the call. + * Or we match against the known pids. 
+ */ + if (islot < 0) { + islot = fixup_slot(lo, cmd, &arg); + } + if (islot >= 0) + slot = & lo->slots[islot]; + + + // PTB these are all always local ioctls + switch (cmd) { + int err; + int intval; + int do_reenable; + + case ENBD_CLEAR_SOCK: + if (islot < 0) { + ENBD_ALERT ("CLEAR_SOCK called on full device nd%s arg %lx\n", + lo->devnam, arg); + return -EINVAL; + } + err = enbd_clr_sock (slot); + return err; + + case ENBD_SET_SOCK: + if (islot < 0) { + ENBD_ALERT ("SET_SOCK called on full device nd%s arg %lx\n", + lo->devnam, arg); + return -EINVAL; + } + err = enbd_set_sock (slot, arg); + return err; + + case BLKBSZGET: + // PTB The kernel should intercept this + ENBD_ALERT ("attempted get_blksize with BLKBSZGET\n"); + return -EINVAL; + + case ENBD_GET_BLKSIZE: + if (!(atomic_read (&lo->flags) & ENBD_BLKSIZED)) { + return -EINVAL; + } + err = put_user (lo->blksize, (long *) arg); + return err; + + case BLKBSZSET: + // PTB The kernel should have intercepted this + ENBD_ALERT ("attempted set_blksize with BLKBSZSET\n"); + return -EINVAL; + + case ENBD_SET_BLKSIZE: + if (!arg) + return -EINVAL; + intval = -1; + if (get_user (intval, (int *)arg)) + return -EFAULT; + if (intval == -1) { + ENBD_ALERT ("BLKBSZSET got %d from user\n", intval); + } + err = enbd_set_blksize (lo, intval); + return err; + + case ENBD_SET_SIZE: + err = enbd_set_size (lo, (__u64) arg); + return err; + + case ENBD_SET_SECTORS: + err = enbd_set_size (lo, ((__u64) arg) << 9); + return err; + + case MY_NBD_SET_INTVL: /* WG */ + err = enbd_set_intvl (lo, arg); + return err; + + case MY_NBD_SET_SPID: + if (islot < 0) { + ENBD_ALERT ("SET_SPID called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + err = enbd_set_spid (slot, arg); + return err; + + case MY_NBD_SET_BUFFERWR: + err = enbd_set_bufferwr (lo, arg); + return err; + + case MY_NBD_REG_BUF: /* PTB register your buffer per socket here */ + if (!arg) { + /* PTB this serves as existence check for this ioctl */ + return 0; + } + 
if (islot < 0) { + ENBD_ALERT ("REG_BUF called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + err = enbd_reg_buf (slot, (char *) arg); + return err; + + case MY_NBD_SET_SIG: + if (islot < 0) { + int sigbuf[(ENBD_SIGLEN + sizeof(int) -1)/sizeof(int)]; + // PTB set_sig called on whole device + if (copy_from_user((char *)sigbuf, (char *)arg, ENBD_SIGLEN)) { + ENBD_ERROR("could not copy sig from user\n"); + return -EFAULT; + } + err = enbd_set_sig (lo, sigbuf); + return err; + } + err = my_nbd_set_sig (slot, (int *) arg); + return err; + + case MY_NBD_GET_REQ: + if (islot < 0) { + ENBD_ALERT ("GET_REQ called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + if (arg < 4096) { + arg = (unsigned)slot->buffer; + if (!arg) + return -EINVAL; + } + err = enbd_get_req (slot); + return err; + + case MY_NBD_GET_NPORT: + err = enbd_get_nport (lo, (int *) arg); + return err; + + case MY_NBD_CLR_REQ: + if (islot < 0) { + ENBD_ALERT ("CLR_REQ called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + enbd_rollback_all (slot); + return 0; + + case MY_NBD_ERR_REQ: + if (islot < 0) { + ENBD_ALERT ("ERR_REQ called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + enbd_error_all (slot); + return 0; + + case ENBD_GET_MAXCONN: + err = put_user (ENBD_MAXCONN, (int *)arg); + return err; + + case MY_NBD_SYNC: + + // PTB maybe run the reenable function + do_reenable = 0; + write_lock(&lo->meta_lock); + if (lo->reenable_time != 0 + && time_before(lo->reenable_time,jiffies)) { + lo->reenable_time = 0; + do_reenable = 1; + } + write_unlock(&lo->meta_lock); + if (do_reenable) + enbd_reenable(lo); + + // PTB error too old reqs if show_errs set, else roll them back + enbd_rollback_old (lo); + + // PTB opportunity to calculate speed + enbd_set_speed (lo); + + return 0; + + case MY_NBD_ACK: + if (islot < 0) { + ENBD_ALERT ("ENBD_ACK called on full device nd%s\n", + lo->devnam); + return -EINVAL; + } + err = enbd_ack (slot); + return err; + + /* let 
this be compiled in always - it's useful. PTB */ + case ENBD_PRINT_DEBUG: + ENBD_INFO("device %d: hd = %p, tl = %p, in = %d, out = %d\n", + minor, + list_head (&lo->queue, struct request, queuelist), + list_tail (&lo->queue, struct request, queuelist), + atomic_read (&lo->requests_in[READ]) + + atomic_read (&lo->requests_in[WRITE]), + atomic_read (&lo->requests_out[READ]) + + atomic_read (&lo->requests_out[WRITE]) + ); + err = 0; + return err; + case ENBD_HARD_RESET: /* PTB - debugging */ + err = enbd_hard_reset (lo); + return err; + + case ENBD_RESET: /* PTB - debugging */ + err = enbd_soft_reset (lo); + // PTB we reenable in 5s + enbd_reenable_delay(lo, 5); + return err; + + case ENBD_SET_MD5SUM: /* PTB - change to do/plead md5summing */ + if (arg & 1) { + atomic_set_mask (ENBD_MD5SUM, &lo->flags); + if (arg != 1) { + write_lock(&lo->meta_lock); + lo->md5_off_threshold = (long)arg >> 1; + write_unlock(&lo->meta_lock); + } + } else { + atomic_clear_mask (ENBD_MD5SUM, &lo->flags); + if (arg != 0) { + write_lock(&lo->meta_lock); + lo->md5_on_threshold = (long)arg >> 1; + write_unlock(&lo->meta_lock); + } + } + err = 0; + return err; + + case MY_NBD_SET_SHOW_ERRS: /* PTB/WG - change show error status */ + if (arg) { + atomic_set_mask (ENBD_SHOW_ERRS, &lo->flags); + } else { + atomic_clear_mask (ENBD_SHOW_ERRS, &lo->flags); + } + return 0; + + case MY_NBD_SET_DIRECT: /* PTB - change o_direct status */ + if (arg) { + atomic_set_mask (ENBD_DIRECT, &lo->flags); + } else { + atomic_clear_mask (ENBD_DIRECT, &lo->flags); + } + return 0; + + case MY_NBD_INVALIDATE: + err = enbd_set_remote_invalid (lo, (int) arg); + return err; + + case ENBD_SET_PF_MEMALLOC: + if (arg) { + current->flags |= PF_MEMALLOC; + } else { + current->flags &= ~PF_MEMALLOC; + } + return 0; + } // PTB endsw + + // PTB these are the standard ioctls, and we might get them from + // the other side + + switch (cmd) { + int err; + int intval; + + case BLKROSET: /* PTB - change ro status */ + if 
(get_user(intval, (int*)arg)) + return -EFAULT; + // PTB local flags + enbd_set_read_only(lo, intval); + return 0; + + case BLKROGET: + intval = enbd_read_only(lo); + return put_user(intval, (int*)arg); + + case BLKFLSBUF: + enbd_maybe_sync_sync (lo); // PTB normally fsync_dev + return 0; + + case HDIO_GETGEO: + if (!arg) { + return -EINVAL; + } else { + struct hd_geometry *geo = + (struct hd_geometry *) arg; + int sectors = enbd_sizes[nbd << ENBD_SHIFT] << 1; + unsigned short c; + unsigned char h, s; + if (sectors < (1 << 22)) { + h = 4; + s = 16; + c = sectors >> 6; + } else { + h = 255; + s = 63; + c = (sectors / h) / s; + } + err = 0; + if ((err = put_user (c, &geo->cylinders), err < 0) + || (err = put_user (h, &geo->heads), err < 0) + || (err = put_user (s, &geo->sectors), err < 0) + || (err = put_user (h, &geo->start), err < 0)) { + return err; + } + } + return 0; + +#ifndef BLKMDNTFY +#define BLKMDNTFY _IOW(0x12,133,int) +#endif + case BLKMDNTFY: + ENBD_INFO ("received BLKMDNTFY, am now in raid %x\n", + (unsigned) arg); + err = enbd_md.inc(&enbd_md); + if (err < 0) + return err; + // PTB count the individual partition and whole disk inclusions + if (slot) + slot->md_count++; + atomic_inc(&lo->md_count); + if (!atomic_test_and_set_mask (&lo->flags, ENBD_SHOW_ERRS)) { + ENBD_INFO ("set show_errs on nd%s\n", lo->devnam); + atomic_set_mask (ENBD_RAID_SHOW_ERRS, &lo->flags); + } + return 0; + +#ifndef BLKMDUNTFY +#define BLKMDUNTFY _IOW(0x12,134,int) +#endif + case BLKMDUNTFY: + ENBD_INFO ("received BLKMDUNTFY, now out of raid %x\n", + (unsigned) arg); + // PTB count the individual partition and whole disk inclusions + if (slot) + slot->md_count--; + err = enbd_md.dec(&enbd_md); + if (err < 0) + return err; + if (atomic_dec_and_test(&lo->md_count) + && atomic_test_and_clear_mask(&lo->flags, ENBD_RAID_SHOW_ERRS)) { + ENBD_INFO ("cleared show_errs on nd%s\n", lo->devnam); + } + return 0; + +#ifndef BLKMDRGTR +#define BLKMDRGTR _IOW(0x12,135,unsigned long) +#endif + 
case BLKMDRGTR: + enbd_md.reg(&enbd_md, (int(*)(dev_t, int))arg); + return 0; + + } // PTB endsw + + if (enbd_remote_ioctl.remote != NULL) { + struct enbd_ioctl *remote_ioctl = enbd_remote_ioctl.remote; + + if (remote_ioctl->convert_inplace (&cmd) < 0) { + ENBD_ALERT ("unauthorized ioctl %#x\n", cmd); + return -EINVAL; + } + + err = do_enbd_remote_ioctl(lo, cmd, arg); + return err; + } + return -EINVAL; +} + +/* + * PTB - release the device. This happens when the last process closes + * or dies. + */ +static int +enbd_release (struct inode *inode, struct file *file) +{ + struct enbd_device *lo; + int dev; + int nbd; + int islot; + + if (!inode) { + ENBD_ALERT ("null inode.\n"); + return -ENODEV; + } + dev = iminor (inode); + nbd = dev >> ENBD_SHIFT; + + if (nbd >= MAX_NBD) { + // PTB impossible + ENBD_ALERT ("too many open devices.\n"); + return -ENODEV; + } + + lo = &enbd_dev[nbd]; + + islot = dev % ENBD_MAXCONN - 1; + + // PTB it is a daemon closing the slot? + if (islot >= 0 || (islot = find_slot(lo, current->pid), islot >= 0)) { + struct enbd_slot *slot = &lo->slots[islot]; + --slot->refcnt; + if (slot->pid == current->pid) { + + enbd_clr_sock (slot); + ENBD_ALERT ("(%d): erasing slot pid %d\n", islot, slot->pid); + slot->pid = 0; + if (slot->refcnt > 0) { + ENBD_ALERT + ("slot owner process %d released slot nd%s%d while not last\n", + slot->pid, lo->devnam, islot + 1); + } + } + } + + /* POSSIBLE change socket here PTB */ + + atomic_dec (&lo->refcnt); + + // PTB invalidate buffers on last close if show_err set + if (atomic_read (&lo->refcnt) <= 0 || !module_is_live(THIS_MODULE)) { + struct enbd_seqno * seqno_out = &lo->seqno_out; + + if (atomic_read (&lo->flags) & ENBD_SHOW_ERRS) { + // PTB added for 2.6.1 in 2.4.32 - experiment + enbd_set_remote_invalid (lo, 1); + } + // PTB in any case the daemons are dead! 
+ lo->bufsiz = 0; + seqno_out->reset(seqno_out); + } + + if (file + && (file->f_flags & O_DIRECT) + // PTB we set this to show we made iobuf + && (file->f_flags & O_NOFOLLOW)) { + file->f_flags &= ~(O_DIRECT|O_NOFOLLOW); + } + + return 0; +} + +/* + * Just signal completion of a fake read block 0 request. + */ +static int +enbd_rb0_complete (struct bio *bio, unsigned int bytes_done, int err) +{ + struct completion * x = bio->bi_private; + ENBD_INFO ("complete validation (%d/%d bytes read, err %d)\n", + bytes_done, bytes_done + bio->bi_size, err); + + if (bio->bi_size) { + // PTB not finished yet + return 1; + } + + complete (x); // PTB signal + return 0; +} + +/* + * Read and throw away block zero + */ +static int +enbd_read_block_0 (struct enbd_device *lo) +{ + struct inode *inode = lo->inode; + struct block_device *bdev = inode ? inode->i_bdev : NULL; + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + struct page *page; + size_t size; + int err; + request_queue_t * q = bdev_get_queue (bdev); + + ENBD_INFO ("start validation of device nd%s\n", lo->devnam); + + if (!bdev) { + ENBD_ERROR ("oops, device nd%s has no bdev!\n", lo->devnam); + return -EINVAL; + } + + // critical region because of flag stuff + if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) { + ENBD_ERROR ("validate when device nd%s not enabled yet\n", + lo->devnam); + return -EINVAL; + } + if (atomic_test_and_set_mask (&lo->flags, ENBD_VALIDATING)) { + // PTB we forget to set the flag before launching + ENBD_ERROR ("validate when device nd%s already validating!\n", + lo->devnam); + return -EINVAL; + } + + ENBD_INFO ("will alloc page of memory\n"); + page = alloc_page (GFP_NOIO); + if (!page) { + ENBD_ERROR ("out of memory for new page!\n"); + atomic_clear_mask (ENBD_VALIDATING, &lo->flags); + return -ENOMEM; + } + ENBD_INFO ("got allocated page %p\n", page); + + size = bdev->bd_block_size; + if (!size) + size = 1024; + + bio_init (&bio); + bio.bi_io_vec = &bio_vec; + 
bio_vec.bv_page = page; + bio_vec.bv_len = size; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = size; + bio.bi_bdev = bdev; + bio.bi_sector = 0; + + ENBD_INFO ("attach completion %p to bio %p\n", &complete, &bio); + + init_completion (&complete); + bio.bi_private = &complete; + bio.bi_end_io = enbd_rb0_complete; + + // Jens Axboe says to get info this way + bio_get(&bio); + + ENBD_INFO ("will submit bio %p\n", &bio); + submit_bio (READ, &bio); + + ENBD_INFO ("will unplug device queue %p\n", q); + generic_unplug_device (q); + + ENBD_INFO ("will run request fn on device queue %p\n", q); + spin_lock_irq(q->queue_lock); + do_enbd_request (q); + spin_unlock_irq(q->queue_lock); + ENBD_INFO ("will wait for completion %p\n", &complete); + wait_for_completion (&complete); + + // Jens says to pick up information now + err = (!(bio.bi_flags & (1 << BIO_UPTODATE))) ? -EIO : 0; + bio_put(&bio); + + ENBD_INFO ("will clear VALIDATING flag on nd%s\n", lo->devnam); + atomic_clear_mask (ENBD_VALIDATING, &lo->flags); + + ENBD_INFO ("will free page %p\n", page); + __free_page (page); // PTB discard + + ENBD_INFO ("end validation (%d/%d bytes read) of device nd%s, ret %d\n", + size - bio.bi_size, size, lo->devnam, err); + return err; +} + +/* + * This is called by the kernel (via check_disk_change) whenever it + * opens the superblock of a FS. And we call it whenever there is + * an open of the device (provided we haven't already checked recently, + * as recorded by last_checked). Return 0 for no change and 1 for + * change. 
+ */ +static int +enbd_media_changed(struct gendisk *disk) { + + struct enbd_device *lo = disk->private_data; + struct enbd_ioctl *remote_ioctl = enbd_remote_ioctl.remote; + int cmd, err; + + if (!lo || lo->magic != ENBD_DEV_MAGIC) + return 0; + + ENBD_ALERT("MEDIA CHANGED called\n"); + + // PTB want to send out a query over the net + cmd = ENBD_REMOTE_CHECK; + + if (!remote_ioctl) { + ENBD_ALERT("REMOTE CHECK done locally, no remote ioctl!\n"); + goto error_out; + } + if (remote_ioctl->convert_inplace(&cmd) < 0) { + ENBD_ALERT("REMOTE CHECK is flagged as unknown ioctl %#x!\n", + cmd); + goto error_out; + } + + // PTB try the first ioctl + err = do_enbd_remote_ioctl(lo, cmd, 0); + + + if (err < 0) { + // PTB try the second ioctl + err = do_enbd_remote_ioctl(lo, cmd, 1); + + if (err < 0) { + // PTB could have been timeout or unknown ioctl + ENBD_ALERT("REMOTE CHECK remote is down!\n"); + goto error_out; + } + // PTB second ioctl confirms remote is invalid + if (atomic_test_and_clear_mask(&lo->flags,ENBD_VALIDATED)) { + ENBD_ALERT("REMOTE CHECK set INVALID on nd%s\n", + lo->devnam); + } else { + ENBD_ALERT("REMOTE CHECK still has INVALID on nd%s\n", + lo->devnam); + } + if (!atomic_test_and_set_mask(&lo->flags,ENBD_REMOTE_INVALID)){ + ENBD_ALERT("REMOTE CHECK set REMOTE INVALID on nd%s\n", + lo->devnam); + //__invalidate_device(lo->inode->i_bdev, 0); + return 1; + } else { + ENBD_ALERT("REMOTE CHECK still has REMOTE INVALID on nd%s\n", + lo->devnam); + return 0; + } + } + + if (atomic_test_and_clear_mask (&lo->flags, ENBD_REMOTE_INVALID)) { + ENBD_ALERT("REMOTE CHECK set REMOTE VALID on nd%s\n", + lo->devnam); + return 1; + } else { + // PTB this is called by check_disk_change + //__invalidate_device(lo->inode->i_bdev, 0); + ENBD_ALERT("REMOTE CHECK still has REMOTE VALID on nd%s\n", + lo->devnam); + return 0; + } + + + // PTB remote ioctl returns 0 for success and we read the result + +error_out: + // PTB local reply. Remote is down. Say what we already know. 
+ if (!(atomic_read (&lo->flags) & ENBD_VALIDATED)) { + /* PTB temporarily return to 2.4 semantics as this seems not to + * allow startup without enbd_ioctl in the kernel. + if (!atomic_test_and_set_mask(&lo->flags,ENBD_REMOTE_INVALID)) { + ENBD_ALERT("REMOTE CHECK set REMOTE INVALID on nd%s\n", + lo->devnam); + */ + return 1; + /* PTB temporarily return to 2.4 semantics as this seems not to + * allow startup without enbd_ioctl in the kernel. + } + return 0; + */ + } + /* PTB temporarily return to 2.4 semantics as this seems not to + * allow startup without enbd_ioctl in the kernel. + if (atomic_test_and_clear_mask(&lo->flags,ENBD_REMOTE_INVALID)) { + ENBD_ALERT("REMOTE CHECK set REMOTE VALID on nd%s\n", + lo->devnam); + return 1; + } + */ + return 0; +} + +/* + * Supposed to be called by the kernel to let us clean up after the removable + * device has come back. + */ +static int +enbd_revalidate(struct gendisk *disk) { + + struct enbd_device *lo = disk->private_data; + unsigned long flags; + + if (!lo || lo->magic != ENBD_DEV_MAGIC){ + return -EINVAL; + } + + ENBD_ALERT("REVALIDATE called on nd%s\n", lo->devnam); + + flags = atomic_read (&lo->flags); + if (flags & ENBD_REMOTE_INVALID) { + + ENBD_ALERT("asked to revalidate remotely dead drive nd%s\n", + lo->devnam); + return -ENODEV; + } + if (!(flags & ENBD_ENABLED)) { + + ENBD_ALERT("asked to revalidate disabled drive nd%s\n", + lo->devnam); + return -ENODEV; + } + + + // PTB read the first block + /* + if (enbd_read_block_0(lo) < 0) { + // PTB failed to validate it + if (atomic_test_and_clear_mask(&lo->flags, ENBD_VALIDATED)) { + ENBD_ALERT("set INVALID on nd%s\n", lo->devnam); + } else { + ENBD_ALERT("still INVALID on nd%s\n", lo->devnam); + } + return -ENODEV; + } + */ + + // PTB succeeded in validating + if (!atomic_test_and_set_mask (&lo->flags, ENBD_VALIDATED)) { + ENBD_ALERT("set VALID on nd%s\n", lo->devnam); + } else { + //ENBD_ALERT("still VALID on nd%s\n", lo->devnam); + } + + return 0; +} + +#if 
defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) +/* + * MEDIA_LOCK, MEDIA_UNLOCK + * optarg == 0 - do not adjust usage count (compatibility) + * optarg == 1 - adjust usage count + */ +static int +enbd_mediactl (struct block_device *bdev, int op, int optarg) +{ + struct enbd_device *lo = &enbd_dev[iminor (bdev->bd_inode) >> ENBD_SHIFT]; + + switch (op) { + case MEDIA_LOCK: + case MEDIA_UNLOCK: + if (op == MEDIA_UNLOCK && optarg) { + atomic_dec (&lo->refcnt); + if (atomic_read (&lo->refcnt) < 0) + atomic_set (&lo->refcnt, 0); + } + if (supermount_usage_count (bdev, atomic_read (&lo->refcnt)) == 0) + ; // FIXME want lock_door(lo, (op == MEDIA_LOCK)) + if (op == MEDIA_LOCK && optarg) + atomic_inc (&lo->refcnt); + break; + default: + return -ENOSYS; + } + return 0; +} +#endif /* defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) */ + + + +static struct block_device_operations enbd_blkops = { + owner: THIS_MODULE, + open: enbd_open, + release: enbd_release, + ioctl: enbd_ioctl, + media_changed: enbd_media_changed, + revalidate_disk: enbd_revalidate, +#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) + mediactl: enbd_mediactl, +#endif /* defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) */ +}; + +/* + * PTB - For use in set_disk. + */ +static struct kobject * +enbd_find (dev_t dev, int *part, void *data) +{ + struct enbd_device *lo = data; + if (!lo) + return NULL; + if (lo->magic != ENBD_DEV_MAGIC) + return NULL; + if (!lo->disk) + return NULL; + if (part && (*part < 0 || *part >= ENBD_MAXCONN)) + return NULL; + return get_disk (lo->disk); +} + + +/* + * PTB - used in setup to fix the gendisk data, chain the gendisk, + * claim the blockdev region, etc. 
+ */ +static int +enbd_set_disk (struct enbd_device *lo, unsigned first_minor, unsigned npart) +{ + struct gendisk *disk = lo->disk; + if (!disk) { + return -EINVAL; + } + disk->major = major; + disk->first_minor = first_minor; + disk->minors = npart; + disk->fops = &enbd_blkops; + disk->private_data = lo; + disk->queue = lo->q; + sprintf (disk->disk_name, "nd%s", lo->devnam); +#ifdef CONFIG_DEVFS_FS + sprintf (disk->devfs_name, "nd/%s", lo->devnam); +#endif /* CONFIG_DEVFS_FS */ + // have to set minors (or capacity) to 1 (0) to avoid check disk + set_capacity (disk, 0); + add_disk (disk); + blk_register_region (MKDEV (major, first_minor), + npart, THIS_MODULE, enbd_find, NULL, lo); + set_capacity (disk, lo->bytesize >> 9); + // we should rescan later. From userland? + return 0; +} + +/* + * Pavel - And here should be modules and kernel interface + * (Just smiley confuses emacs :-) + */ + +/* + * PTB - and now to play with the sysctl interface ... + */ +static int +enbd_proc_dobit (ctl_table * table, int write, struct file *filp, + void *buffer, size_t * lenp , loff_t *ppos) +{ + size_t len; + char *p, c; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + + short clear = 0, set = 0; + + len = 0; + p = buffer; + + // read from buffer until *lenp or end of entry + while (len < *lenp) { + if (get_user (c, p++)) + return -EFAULT; + if (c == 0 || c == '\n') + break; + if (c == '0') { + if (set) { + set = 0; + break; + } + clear = 1; + break; + } + if (c == '1') { + clear = 0; + set = 1; + break; + } + if (c != ' ' && c != '\t' && c != '\r') { + clear = set = 0; + break; + } + len++; + } + + if (!set && !clear) { + return -EINVAL; + } + if (set) + atomic_set_mask ((unsigned long) table->extra1, + (atomic_t *) table->data); + if (clear) + atomic_clear_mask ((unsigned long) table->extra1, + (atomic_t *) table->data); + filp->f_pos += *lenp; + + } + else { + + // reading + len = 1; + if (len > 
*lenp) + len = *lenp; + if (len) { + char *sym = (atomic_read ((atomic_t *) table->data) & + (unsigned long) table->extra1) ? "1" : "0"; + if (copy_to_user(buffer, sym, len)) { + return -EFAULT; + } + } + if (len < *lenp) { + if (put_user ('\n', ((char *) buffer) + len)) + return -EFAULT; + len++; + } + *lenp = len; + filp->f_pos += len; + } + return 0; +} + + +static ctl_table enbd_systable [MAX_NBD+1] = { + {0}, +}; + +static struct ctl_table_header *enbd_table_header; +// the above was set by the register call of the root table + +static ctl_table enbd_sysdefaultstable [] = { + {3, "sync", + &sync, sizeof (int), 0644, NULL, &proc_dointvec}, + {4, "merge_requests", + &merge_requests, sizeof (int), 0644, NULL, &proc_dointvec}, + {8, "md5_on_threshold", + &md5_on_threshold, sizeof (int), 0644, NULL, &proc_dointvec}, + {9, "md5_off_threshold", + &md5_off_threshold, sizeof (int), 0644, NULL, &proc_dointvec}, + {10, "md5_noauto", + &md5_noauto, sizeof (int), 0644, NULL, &proc_dointvec}, + {11, "md5sum", + &md5sum, sizeof (int), 0644, NULL, &proc_dointvec}, +#ifndef NO_BUFFERED_WRITES + {12, "buffer_writes", + &buffer_writes, sizeof (int), 0644, NULL, &proc_dointvec}, +#endif + {13, "show_errs", + &show_errs, sizeof (int), 0644, NULL, &proc_dointvec}, + {14, "req_timeo", + // PTB FIXME needs proc_doatomicintvec + &req_timeo, sizeof (int), 0644, NULL, &proc_dointvec}, + {0}, +}; + + +static ctl_table enbd_table[] = { + {1, "rahead", + &rahead, sizeof (int), 0644, NULL, &proc_dointvec}, + {2, "plug", + &plug, sizeof (int), 0644, NULL, &proc_dointvec}, + {13, "devices", + NULL, 0, 0555, enbd_systable, NULL, }, + {14, "defaults", + NULL, 0, 0555, enbd_sysdefaultstable, NULL, }, + {0} +}; +static ctl_table enbd_dir_table[] = { + {6, "enbd", NULL, 0, 0555, enbd_table}, + {0} +}; +static ctl_table enbd_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, enbd_dir_table}, + {0} +}; + +/* + * This has no dangerous race condition, as we disable the device + * rather than destroy 
it!
+ */
+static void
+enbd_reset (struct enbd_device *lo)
+{
+	/* Unlocked early refusal; the same check is repeated under meta_lock. */
+	if (atomic_read(&lo->refcnt) > 0) {
+		ENBD_ALERT("enbd_reset called on open device\n");
+		return;
+	}
+	enbd_sync_sync(lo);	// takes no lock
+	write_lock(&lo->meta_lock);
+	if (atomic_read(&lo->refcnt) > 0) {
+		write_unlock(&lo->meta_lock);
+		ENBD_ALERT("enbd_reset called on open device\n");
+		return;
+	}
+	atomic_clear_mask(ENBD_ENABLED, &lo->flags);
+	atomic_clear_mask(ENBD_VALIDATED, &lo->flags);
+	// PTB clear SIGNED suggested by Dag Sverre Seljebotn
+	atomic_clear_mask(ENBD_SIGNED, &lo->flags);
+	lo->nslot = 0;
+	write_unlock(&lo->meta_lock);
+	ENBD_ALERT("set INVALID on nd%s\n", lo->devnam);
+	//__invalidate_device(lo->inode->i_bdev, 0);
+	ENBD_ALERT ("reset device nd%s\n", lo->devnam);
+}
+
+/*
+ * sysctl handler for a device's "blksize" entry.
+ *
+ * Reads go straight to proc_dointvec(). On a write the old block size is
+ * saved, proc_dointvec() parses the new value into lo->blksize, the old
+ * value is put back, and the change is then applied via enbd_set_blksize().
+ *
+ * Returns the proc_dointvec() result, or the negative error returned by
+ * enbd_set_blksize().
+ */
+static int
+proc_doblksize(ctl_table *table, int write, struct file *filp,
+	       void __user *buffer, size_t *lenp, loff_t *ppos) {
+
+	const int offset = (char *)&((struct enbd_device *)NULL)->blksize
+			   - (char *)NULL;
+	/*
+	 * The "blksize" systable entry set up in enbd_setup() stores
+	 * &lo->blksize in table->data, so the owning device is found by
+	 * stepping back from there. "buffer" is the caller's user-space
+	 * buffer and says nothing about where the device lives.
+	 */
+	struct enbd_device *lo = (void *)((char *)table->data - offset);
+	int res;
+	int blksize = -1;
+	if (write) {
+		// PTB save
+		blksize = lo->blksize;
+	}
+	res = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	if (write) {
+		int err;
+		if (res < 0) {
+			// PTB restore
+			lo->blksize = blksize;
+			return res;
+		}
+		if (blksize != lo->blksize) {
+			// PTB changed
+			int new_blksize = lo->blksize;
+			// PTB restore
+			lo->blksize = blksize;
+			err = enbd_set_blksize (lo, new_blksize);
+			if (err < 0) {
+				return err;
+			}
+		}
+	}
+	return res;
+}
+
+/*
+ * Give device number i its default state: magic, name, per-slot queues and
+ * locks, default sizes, method pointers, flag bits copied from the
+ * driver-wide settings (md5sum, sync, show_errs, direct, buffer_writes,
+ * merge_requests) and the per-device sysctl table.
+ * Does nothing when i is outside [0, MAX_NBD).
+ */
+static void
+enbd_setup(struct enbd_device *lo, int i) {
+
+	int j;
+
+	if (i < 0 || i >= MAX_NBD)
+		return;
+	lo->magic = ENBD_DEV_MAGIC;
+	strncpy (lo->devnam, device_letter (i), 4);
+	for (j = 0; j < ENBD_MAXCONN; j++) {	/* PTB */
+		struct enbd_slot *slot = &lo->slots[j];
+		slot->lo = lo;
+		slot->i = j;
+		INIT_LIST_HEAD (&slot->queue);
+		spin_lock_init(&slot->lock);
+	}
+	lo->blksize = 1024;	/* PTB 132 */
+	lo->logblksize = 10;	/* PTB */
+	lo->bytesize = 0x7fffffff00000LL;	/* PTB 132 */
+	lo->size = 0x7fffffff;	/* PTB (bytesizes >> 10) */
+	lo->sectors = 0xfffffffe;	/* PTB sectors */
+	lo->nbd = i;
+	lo->req_timeo = req_timeo;	/* PTB default pulse intvl */
+	lo->max_sectors = buf_sectors;
+	lo->md5_off_threshold = md5_off_threshold;
+	lo->md5_on_threshold = md5_on_threshold;
+
+	lo->enable = enbd_enable;
+	lo->reset = enbd_reset;
+	lo->disable = enbd_disable;
+	lo->read_only = enbd_read_only;
+	lo->set_speed = enbd_set_speed;
+	lo->hard_reset = enbd_hard_reset;
+	lo->soft_reset = enbd_soft_reset;
+	lo->reenable_delay = enbd_reenable_delay;
+
+	INIT_LIST_HEAD (&lo->queue);
+	init_waitqueue_head (&lo->wq);
+	init_waitqueue_head (&lo->req_wq);
+	init_MUTEX(&lo->pid_sem);
+	rwlock_init (&lo->queue_lock);
+	rwlock_init (&lo->meta_lock);
+	for (j = 0; j < ENBD_MAXCONN; j++) {
+		enbd_blksizes[i * ENBD_MAXCONN + j] = lo->blksize;
+		enbd_bytesizes[i * ENBD_MAXCONN + j] = lo->bytesize;
+		enbd_sizes[i * ENBD_MAXCONN + j] = lo->size;
+		enbd_max_sectors[i * ENBD_MAXCONN + j] = lo->max_sectors;
+	}
+	enbd_init_seqno(&lo->seqno_out);
+	enbd_init_speed(&lo->rspeed);
+	enbd_init_speed(&lo->wspeed);
+	enbd_init_speed(&lo->tspeed);
+
+	// PTB queue has already been initialized, or will be
+	lo->q = lo->disk ? lo->disk->queue : NULL;
+
+	if (md5sum) {
+		atomic_set_mask (ENBD_MD5SUM, &lo->flags);
+	}
+	if (sync) {
+		atomic_set_mask (ENBD_SYNC, &lo->flags);
+	}
+	if (show_errs) {
+		atomic_set_mask (ENBD_SHOW_ERRS, &lo->flags);
+	}
+	if (direct) {
+		atomic_set_mask (ENBD_DIRECT, &lo->flags);
+	}
+	if (buffer_writes) {
+		atomic_set_mask (ENBD_BUFFERWR, &lo->flags);
+	}
+	if (merge_requests) {
+		atomic_set(&lo->merge_requests, merge_requests);
+	}
+
+	lo->systable[0] = (ctl_table) {1, "md5_off_threshold",
+		&lo->md5_off_threshold, sizeof (int), 0644,
+		NULL, &proc_dointvec, };
+	lo->systable[1] = (ctl_table) {2, "md5_on_threshold",
+		&lo->md5_on_threshold, sizeof (int), 0644,
+		NULL, &proc_dointvec, };
+	lo->systable[2] = (ctl_table) {3, "md5_noauto",
+		&lo->flags, 1, 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_MD5SUM_NOAUTO, };
+	lo->systable[3] = (ctl_table) {4, "merge_requests",
+		&lo->merge_requests, sizeof (int), 0644,
+		NULL, &proc_dointvec, };
+	lo->systable[4] = (ctl_table) {5, "sync",
+		&lo->flags, sizeof (int), 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_SYNC, };
+	lo->systable[5] = (ctl_table) {6, "show_errs",
+		&lo->flags, sizeof (int), 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_SHOW_ERRS, };
+	lo->systable[6] = (ctl_table) {7, "buffer_writes",
+		&lo->flags, sizeof (int), 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_BUFFERWR, };
+	lo->systable[7] = (ctl_table) {8, "md5sum",
+		&lo->flags, 1, 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_MD5SUM, };
+	lo->systable[8] = (ctl_table) {9, "direct",
+		&lo->flags, 1, 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_DIRECT, };
+	lo->systable[9] = (ctl_table) {10, "req_timeo",
+		&lo->req_timeo, sizeof (int), 0644,
+		NULL, &proc_dointvec, };
+	lo->systable[10] = (ctl_table) {11, "blksize",
+		&lo->blksize, sizeof (lo->blksize), 0644,
+		NULL, &proc_doblksize, };
+	lo->systable[11] = (ctl_table) {12, "enabled",
+		&lo->flags, sizeof (int), 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_ENABLED, };
+	lo->systable[12] = (ctl_table) {13, "validated",
+		&lo->flags, sizeof (int), 0644,
+		NULL, &enbd_proc_dobit, NULL, NULL, (void *)ENBD_VALIDATED, };
+
+	/* The last array element is the zeroed terminator. */
+	lo->systable[sizeof(lo->systable)/sizeof(lo->systable[0]) - 1]
+		= (ctl_table) {0};
+
+}
+
+
+
+#ifdef MODULE
+MODULE_AUTHOR ("Peter T. Breuer, Andres Marin");
+MODULE_DESCRIPTION ("Enhanced Network Block Device " ENBD_VERSION);
+MODULE_LICENSE ("GPL");
+#endif /* MODULE */
+
+
+/* PTB -
+ * These functions are needed when the kernel does request merging in
+ * order to stop it making requests that are bigger than our buffer.
+ *
+ * To turn OFF merging (once these functions are in place), set
+ * merge_requests=0.
+ */
+static int
+enbd_merge_requests_fn (request_queue_t * q, struct request *req,
+			struct request *req2)
+{
+	struct enbd_device *lo = rq_get_enbd(req);
+
+	if (!atomic_read(&lo->merge_requests))
+		return 0;
+
+	if (!lo->ll_merge_requests_fn)
+		return 0;
+
+	if (req->nr_sectors + req2->nr_sectors > lo->max_sectors)
+		return 0;
+
+	if (req->nr_sectors + req2->nr_sectors >
+	    ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9)))
+		return 0;
+
+	return lo->ll_merge_requests_fn (q, req, req2);
+}
+static int
+enbd_front_merge_fn (request_queue_t * q, struct request *req, struct bio * bio)
+{
+	struct enbd_device *lo = rq_get_enbd(req);
+
+	if (!atomic_read(&lo->merge_requests))
+		return 0;
+
+	if (!lo->ll_front_merge_fn)
+		return 0;
+
+	if (req->nr_sectors > lo->max_sectors)
+		return 0;
+
+	if (req->nr_sectors > ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9)))
+		return 0;
+
+	return lo->ll_front_merge_fn (q, req, bio);
+}
+static int
+enbd_back_merge_fn (request_queue_t * q, struct request *req,
+		    struct bio * bio)
+{
+	struct enbd_device *lo = rq_get_enbd(req);
+
+	if (!atomic_read(&lo->merge_requests))
+		return 0;
+
+	if (!lo->ll_back_merge_fn)
+		return 0;
+
+	if (req->nr_sectors > lo->max_sectors)
+		return 0;
+
+	if (req->nr_sectors >
+	    ((atomic_read(&lo->merge_requests) + 1) << (lo->logblksize - 9)))
+		return 0;
+
+	return lo->ll_back_merge_fn (q, req, bio);
+}
+
+#ifdef CONFIG_DEVFS_FS
+/*
+ * Create the devfs tree: directory nd/<devnam> for every device, one block
+ * special per minor, a "disk" symlink to minor 0 and "chan<n>" symlinks for
+ * the remaining minors.
+ */
+static void
+enbd_devfs_init(void) {
+
+	int i;
+
+	devfs_mk_dir ("nd");
+	for (i = 0; i < MAX_NBD; i++) {
+		char diskname[15];
+		struct enbd_device *lo = &enbd_dev[i];
+		int j;
+
+		// PTB make the subdirectory "a","b" etc.
+		devfs_mk_dir("nd/%s", lo->devnam);
+
+		// PTB add the blk specials, "0","1" to ENBD_MAXCONN-1
+		/*
+		 * The bound is ENBD_MAXCONN, not MAX_NBD: minors are numbered
+		 * i * ENBD_MAXCONN + j, so any other bound either spills into
+		 * the next device's minors or leaves some uncreated.
+		 */
+		for (j = 0; j < ENBD_MAXCONN; j++) {
+			devfs_mk_bdev(MKDEV(major, i * ENBD_MAXCONN + j),
+				      S_IFBLK | S_IRUSR | S_IWUSR,
+				      "nd/%s/%u", lo->devnam, j);
+		}
+
+		// PTB do the whole disk symlink ..
+		sprintf(diskname, "nd/%s/disk", lo->devnam);
+		devfs_mk_symlink (diskname, "0");
+		// PTB .. and the channel symlinks
+		for (j = 1; j < ENBD_MAXCONN; j++) {
+			char link[4];
+			char name[19];
+			sprintf (link, "%u", j);
+			sprintf (name, "nd/%s/chan%u", lo->devnam, j);
+			devfs_mk_symlink (name, link);
+		}
+	}
+}
+#endif /* CONFIG_DEVFS_FS */
+
+/*
+ * Cap the queue's request size at buf_sectors, remember the queue's own
+ * merge methods in lo, and install the enbd_*_merge_fn wrappers in their
+ * place.
+ */
+static void
+enbd_init_queue (struct enbd_device *lo, struct request_queue *queue)
+{
+
+// PTB - set up kernel queue struct with default methods
+	blk_queue_max_sectors (queue, buf_sectors);	/* max per request */
+
+/*
+ * PTB - I think that put:
+ * - q->plug_device_fn = generic_plug_device (static ll_rw_blk)
+ * - q->plug_tq.routine = generic_unplug_device (static ll_rw_blk)
+ * - q->back_merge_fn = ll_back_merge_fn (static ll_rw_blk)
+ * - q->front_merge_fn = ll_front_merge_fn (static ll_rw_blk)
+ * - q->merge_requests_fn = ll_merge_requests_fn (static ll_rw_blk)
+ * - q->request_fn = do_enbd_request (param)
+ */
+
+/*
+ * PTB - we have to do some more init magic in 2.4.*. This says that we
+ * - take all stuff off the kernel queue before processing it, so in
+ * - particular it is OK for kernel to do merges with the queue head.
+ * blk_queue_headactive (enbd_queue, 0);
+ */
+
+/*
+ * LA - moved the next #if higher;
+ * - kernel 2.2.* doesn't know about plug_device_fn
+ */
+
+	// PTB control merge attempts so we do not overflow our buffer
+	lo->ll_merge_requests_fn = queue->merge_requests_fn;
+	lo->ll_front_merge_fn = queue->front_merge_fn;
+	lo->ll_back_merge_fn = queue->back_merge_fn;
+
+// JSA - Add this line because under >=2.4.1, merge optimizations are in flux
+/*
+ * PTB - however it's not this which does damage, I believe. Data: plugging
+ * - simply has to be enabled in these kernels. Without it, requests just
+ * - sit on the kernel queue and never come off and into our request_fn.
+ * PTB - commented the ifdef again after talks with Jens Axboe.
+ * - Apparently plug_fn will disappear in 2.4.4 and merge functions are
+ * the only way to control merges, so they MUST be included.
+ */
+
+/*
+ * PTB - The functions below just impose our own stricter size limit before
+ * - calling the defaults if all seems OK sizewise.
+ */
+	queue->merge_requests_fn = &enbd_merge_requests_fn;
+	queue->front_merge_fn = &enbd_front_merge_fn;
+	queue->back_merge_fn = &enbd_back_merge_fn;
+
+}
+
+/*
+ * Build one sysctl directory entry per device, each pointing at that
+ * device's own systable, terminate the array, and register the tree.
+ */
+static void
+enbd_sysctl_init (void)
+{
+	int i;
+	for (i = 0; i < MAX_NBD; i++) {
+		struct enbd_device *lo = &enbd_dev[i];
+		enbd_systable[i] = (ctl_table){i+1, lo->devnam,
+			NULL, 0, 0555, lo->systable, NULL, };
+	}
+	enbd_systable[MAX_NBD] = (ctl_table) { 0, };
+	enbd_table_header = register_sysctl_table (enbd_root_table, 1);
+}
+
+/*
+ * Undo the gendisk/queue allocation done by enbd_init() for devices
+ * 0 .. n-1. Devices without a disk are skipped.
+ */
+static void
+enbd_release_disks (int n)
+{
+	while (--n >= 0) {
+		struct enbd_device *lo = &enbd_dev[n];
+		struct gendisk *disk = lo->disk;
+		if (!disk)
+			continue;
+		if (disk->queue)
+			blk_cleanup_queue(disk->queue);
+		put_disk(disk);
+		lo->disk = NULL;
+	}
+}
+
+/*
+ * Module entry point: allocate a gendisk and request queue per device,
+ * register the block major, initialise the md and remote-ioctl hooks, set
+ * up every device, create the "nbdinfo" proc entry, fill in the gendisks
+ * and register the devfs and sysctl interfaces.
+ *
+ * Returns 0 on success, -ENOMEM if a request queue cannot be allocated,
+ * -EIO if the major number cannot be registered. On either failure the
+ * disks and queues allocated so far are released again.
+ */
+int __init
+enbd_init (void)
+{
+	int i;
+	int err = 0;
+	struct proc_dir_entry *res;
+
+	ENBD_INFO ("Network Block Device originally by pavel@elf.mj.gts.cz\n");
+	ENBD_INFO ("Network Block Device port to 2.0 by ptb@it.uc3m.es\n");
+	ENBD_INFO ("Network Block Device move networking to user space by "
+		   "amarin@it.uc3m.es\n");
+	ENBD_INFO ("Enhanced Network Block Device " ENBD_VERSION " by "
+		   "ptb@it.uc3m.es\n");
+
+	for (i = 0; i < MAX_NBD; i++) {
+		struct enbd_device *lo = &enbd_dev[i];
+		/* Pavel says ...
+		 * The new linux 2.5 block layer implementation requires
+		 * every gendisk to have its very own request_queue
+		 * struct. These structs are big so we dynamically allocate
+		 * them.
+		 */
+		struct gendisk *disk = alloc_disk(ENBD_MAXCONN);
+		memset (lo, 0, sizeof (*lo));
+		if (disk) {
+			lo->disk = disk;
+			spin_lock_init(&lo->lock);
+			disk->queue = blk_init_queue(do_enbd_request, &lo->lock);
+			if (!disk->queue) {
+				/* Drop this disk, then everything before it. */
+				put_disk(disk);
+				lo->disk = NULL;
+				enbd_release_disks(i);
+				return -ENOMEM;
+			}
+			enbd_init_queue(lo, disk->queue);
+		}
+	}
+
+
+	if (register_blkdev (major, "nbd")) {
+		ENBD_ERROR ("Unable to register major number %d for NBD\n",
+			    major);
+		/* Do not leak the disks and queues allocated above. */
+		enbd_release_disks(MAX_NBD);
+		return -EIO;
+	}
+#ifdef MODULE
+	ENBD_INFO ("registered device at major %d\n", major);
+#endif
+
+
+	enbd_init_md(&enbd_md);
+	enbd_init_ioctl_stub(&enbd_remote_ioctl);
+
+	/*
+	 * PTB give initial values to struct params
+	 */
+	for (i = 0; i < MAX_NBD; i++) {
+		struct enbd_device *lo = &enbd_dev[i];
+		enbd_setup(lo, i);
+	}
+
+	/*
+	 * PTB we do the disk and partition stuff after we have
+	 * contact, when enbd_open is called for the first time?
+	 */
+
+	res = create_proc_read_entry ("nbdinfo", 0, NULL, NULL, NULL);
+	if (!res) {
+		ENBD_ALERT ("creation of proc entry failed\n");
+	} else {
+		// PTB additional write_proc entry in struct
+		enbd_init_proc(res);
+	}
+
+	// PTB fill in the gendisk structs very late.
+	for (i = 0; i < MAX_NBD; i++) {
+		struct enbd_device *lo = &enbd_dev[i];
+		enbd_set_disk(lo, i * ENBD_MAXCONN, ENBD_MAXCONN);
+	}
+
+#ifdef CONFIG_DEVFS_FS
+	enbd_devfs_init();
+#endif /* CONFIG_DEVFS_FS */
+
+	// PTB - sysctl interface
+	enbd_sysctl_init();
+
+	// PTB we have to wait for the open to complete init with inode val
+
+	return err;
+}
+
+void __exit
+enbd_cleanup (void)
+{
+	int i;
+
+	unregister_sysctl_table (enbd_table_header);
+
+	remove_proc_entry ("nbdinfo", &proc_root);
+
+	for (i = 0; i < MAX_NBD; i++) {
+		struct enbd_device *lo = &enbd_dev[i];
+		struct gendisk *disk = lo->disk;
+		atomic_clear_mask (ENBD_ENABLED, &lo->flags);
+		if (disk) {
+			struct request_queue *queue = disk->queue;
+			if (queue) {
+				blk_cleanup_queue (queue);
+				disk->queue = NULL;
+			}
+			del_gendisk (disk);
+			put_disk (disk);
+		}
+		if (lo->blockmap) {
+			kfree (lo->blockmap);
+			lo->blockmap = NULL;
+		}
+		enbd_systable[i] = (ctl_table) {0};
+		enbd_sync_sync (lo);
+	}
+
+#ifdef CONFIG_DEVFS_FS
+	devfs_remove ("nd");
+#endif
+
+	if (unregister_blkdev (major, "nbd") != 0) {
+		ENBD_ALERT ("cleanup_module failed\n");
+	} else {
+		ENBD_INFO ("module cleaned up.\n");
+	}
+
+}
+module_init (enbd_init);
+module_exit (enbd_cleanup);
+
+EXPORT_SYMBOL(enbd_remote_ioctl);
+
+/* Compile line:
+
+ * gcc -O2 -D__KERNEL__ -DMODULE -DEXPORT_SYMTAB -xc -c enbd.c -o enbd.o
+ *
+ * (possibly with -DMODVERSIONS also). PTB
+ * (possibly with -I/usr/src/linux-x.y.z/include also).
PTB
+ */
--- linux-2.6.7/drivers/block/enbd/enbd_ioctl.c.pre-enbd
+++ linux-2.6.7/drivers/block/enbd/enbd_ioctl.c Fri Apr 9 23:49:08 2004
@@ -0,0 +1,232 @@
+#ifndef __KERNEL__
+#include
+#include
+#endif
+
+#include
+#include
+#include
+#ifndef _CADDR_T
+#define caddr_t char*
+#endif
+#include
+#include
+#include
+#include
+#include
+#ifndef KERNEL_VERSION
+#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
+#endif
+#include
+#include
+#include
+#include
+#include
+
+
+/*
+ * Find the ioctl_conv_tab entry whose "old" number equals ioctl.
+ * The table ends at the first entry with old == 0.
+ * Returns NULL if there is no such entry or if ioctl is -1.
+ */
+static struct ioctl_conv *
+ioctl_lookup_old (int ioctl)
+{
+	int i;
+	unsigned old;
+	if (ioctl == -1)
+		return NULL;
+	for (i = 0; old = ioctl_conv_tab[i].old, old; i++) {
+		if (old == ioctl)
+			return &ioctl_conv_tab[i];
+	}
+	// PTB not there
+	return NULL;
+}
+
+/*
+ * Map an old ioctl number to its new one. An entry whose "new" field is 0
+ * maps the number to itself. Returns -1 if the number is not in the table.
+ */
+static int
+enbd_ioctl_convert (int ioctl)
+{
+	struct ioctl_conv *conv = ioctl_lookup_old (ioctl);
+	if (!conv)
+		// PTB not there
+		return -1;
+	return conv->new ? : ioctl;
+}
+
+/*
+ * As enbd_ioctl_convert(), but rewrites *ioctl in place.
+ * Returns 0 on success, -EINVAL for a NULL pointer or an unknown number.
+ */
+static int
+enbd_ioctl_convert_inplace(int *ioctl) {
+
+	int new_ioctl;
+	if (!ioctl)
+		return -EINVAL;
+	new_ioctl = enbd_ioctl_convert(*ioctl);
+	if (new_ioctl == -1)
+		return -EINVAL;
+	*ioctl = new_ioctl;
+	return 0;
+}
+
+/*
+ * Find the ioctl_conv_tab entry whose "new" number equals ioctl, or whose
+ * "new" is 0 and whose "old" equals ioctl. Returns NULL if there is none.
+ */
+static struct ioctl_conv *
+ioctl_lookup_new (int ioctl)
+{
+	int i = 0;
+	unsigned old;
+	for (i = 0; old = ioctl_conv_tab[i].old, old; i++) {
+		unsigned new = ioctl_conv_tab[i].new;
+		if (new == ioctl || (new == 0 && old == ioctl))
+			return &ioctl_conv_tab[i];
+	}
+	// PTB not there
+	return NULL;
+}
+
+/*
+ * Map a new ioctl number back to its old one, or -1 if it is not known.
+ */
+static int
+enbd_ioctl_revert (int ioctl)
+{
+	struct ioctl_conv *conv = ioctl_lookup_new (ioctl);
+	if (!conv)
+		// PTB not there
+		return -1;
+	return conv->old;
+}
+
+/*
+ * Find the ioctl_special_tab entry for ioctl. The table ends at the first
+ * entry with new == 0. Returns NULL if there is no such entry.
+ */
+static struct ioctl_special *
+ioctl_special_lookup_new (int ioctl)
+{
+	int i;
+	unsigned new;
+	for (i = 0; new = ioctl_special_tab[i].new, new; i++) {
+		if (new == ioctl)
+			return &ioctl_special_tab[i];
+	}
+	// PTB not there
+	return NULL;
+}
+
+/*
+ * Size of the argument of ioctl cmd. Normally the size encoded in cmd;
+ * when that field reads _IOC_SIZEMASK the special table's size() method is
+ * asked instead. Returns -1 if such a cmd has no special entry.
+ */
+static int
+enbd_ioctl_size (int cmd, char *arg)
+{
+	int size = _IOC_SIZE (cmd);
+	if (size == _IOC_SIZEMASK) {
+		// PTB special handling required.
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->size (arg);
+	}
+	return size;
+}
+
+/*
+ * As enbd_ioctl_size(), but uses the special entry's size_user() method.
+ */
+static int
+enbd_ioctl_size_user (int cmd, char *arg)
+{
+	int size = _IOC_SIZE (cmd);
+	if (size == _IOC_SIZEMASK) {
+		// PTB special handling required.
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->size_user (arg);
+	}
+	return size;
+}
+
+
+#ifdef __KERNEL__
+/*
+ * Copy size bytes of ioctl result from buf to the user-space arg.
+ * Returns the number of bytes copied, or -1 if cmd needs special handling
+ * that is not registered or if cmd does not have _IOC_READ set.
+ */
+static int
+enbd_ioctl_copy_to_user (int cmd, char *arg, char *buf, int size)
+	/* buf is the kernel and arg is in user space */
+{
+
+	if (_IOC_SIZE (cmd) == _IOC_SIZEMASK) {
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->ioctl_copy_to_user (arg, buf, size);
+	}
+
+	if (_IOC_DIR (cmd) & _IOC_READ) {
+		// indirect
+		int rem = copy_to_user (arg, buf, size);
+		return size - rem;
+	}
+
+	return -1;
+}
+
+
+
+/*
+ * Fetch the argument of ioctl cmd into buf. An indirect argument is copied
+ * from user space; a direct one is the value of arg itself, copied
+ * bytewise. Returns the number of bytes copied, or -1 on failure.
+ */
+static int
+enbd_ioctl_copy_from_user (int cmd, char *buf, char *arg, int size)
+{
+
+	if (_IOC_SIZE (cmd) == _IOC_SIZEMASK) {
+		struct ioctl_special *special = ioctl_special_lookup_new(cmd);
+		if (!special)
+			return -1;
+		return special->ioctl_copy_from_user (buf, arg, size);
+	}
+
+	/*
+	 * NOTE(review): this tests _IOC_READ, the same flag as the to-user
+	 * path above. Confirm that _IOC_WRITE was not intended here.
+	 */
+	if (_IOC_DIR (cmd) & _IOC_READ) {
+		// indirect
+		int rem = copy_from_user (buf, arg, size);
+		return size - rem;
+	}
+
+	// direct
+	if (size > sizeof (arg)) {
+		return -1;
+	}
+
+	memcpy (buf, &arg, size);
+	return size;
+}
+
+static struct enbd_ioctl struct_ioctl = {
+	convert : enbd_ioctl_convert,
+	convert_inplace : enbd_ioctl_convert_inplace,
+	revert : enbd_ioctl_revert,
+	size : enbd_ioctl_size,
+	size_user : enbd_ioctl_size_user,
+	cp_to_user : enbd_ioctl_copy_to_user,
+	cp_from_user : enbd_ioctl_copy_from_user,
+};
+
+/*
+ * Module entry point: hook struct_ioctl into the enbd driver's remote
+ * ioctl stub. The result of the stub's reg() method is returned rather
+ * than discarded, so the load fails when registration is refused.
+ */
+static int __init
+enbd_ioctl_init (void)
+{
+	struct enbd_ioctl_stub * remote_ioctl = &enbd_remote_ioctl;
+	return remote_ioctl->reg(remote_ioctl, &struct_ioctl);
+}
+
+static void __exit
+enbd_ioctl_cleanup (void) {
+	struct enbd_ioctl_stub * remote_ioctl = &enbd_remote_ioctl;
+	remote_ioctl->unreg(remote_ioctl, &struct_ioctl);
+}
+
+module_init (enbd_ioctl_init);
+module_exit (enbd_ioctl_cleanup);
+
+
+#ifdef MODULE
+int linux_version_code = LINUX_VERSION_CODE;
+	#if LINUX_VERSION_CODE > KERNEL_VERSION(2,1,0)
+	MODULE_AUTHOR ("Peter T. Breuer");
+	MODULE_DESCRIPTION ("Enhanced Network Block Device Remote Ioctl");
+	#ifdef MODULE_LICENSE
+	MODULE_LICENSE("GPL");
+	#endif
+	#endif
+#endif /* MODULE */
+
+#endif /* __KERNEL__ */
+
+
+/*
+static
+int ioctl_init(struct ioctl_conv *self, int old, int new) {
+	self->old = old;
+	self->new = new;
+	self->serialize = ioctl_serialize;
+	self->deserialize = ioctl_deserialize;
+	self->size = ioctl_size;
+}
+*/
--- linux-2.6.7/drivers/block/enbd/enbd_ioctl_stub.c.pre-enbd
+++ linux-2.6.7/drivers/block/enbd/enbd_ioctl_stub.c Mon Jan 19 03:28:23 2004
@@ -0,0 +1,31 @@
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * PTB this is the hook for the enbd_ioctl extra module
+ *
+ * Install x as the remote ioctl implementation.
+ * Returns 0 on success, -EINVAL if one is already installed.
+ */
+static int register_remote_ioctl(struct enbd_ioctl_stub *remote_ioctl, struct enbd_ioctl *x) {
+	if (!remote_ioctl->remote) {
+		remote_ioctl->remote = x;
+		return 0;
+	}
+	return -EINVAL;
+}
+/*
+ * Remove x again. Returns -EINVAL if x is not the installed implementation.
+ */
+static int unregister_remote_ioctl(struct enbd_ioctl_stub *remote_ioctl, struct enbd_ioctl *x) {
+	if (remote_ioctl->remote != x)
+		return -EINVAL;
+	remote_ioctl->remote = NULL;
+	return 0;
+}
+
+/*
+ * Zero the stub and install its reg/unreg methods. Always returns 0.
+ */
+int enbd_init_ioctl_stub(struct enbd_ioctl_stub *remote_ioctl) {
+	memset(remote_ioctl, 0, sizeof(*remote_ioctl));
+	remote_ioctl->reg = register_remote_ioctl;
+	remote_ioctl->unreg = unregister_remote_ioctl;
+	return 0;
+}
+
--- linux-2.6.7/drivers/block/enbd/enbd_md.c.pre-enbd
+++ linux-2.6.7/drivers/block/enbd/enbd_md.c Mon Jan 19 03:29:36 2004
@@ -0,0 +1,104 @@
+#include
+#include
+#include
+
+/*
+ * PTB small driver wide support database for MDRGTR ioctl
+ */
+
+
+
+#ifndef HOT_ADD_DISK
+	#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28)
+#endif
+#ifndef SET_DISK_FAULTY
+	#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
+#endif
+
+/*
+ * Drop one reference from md. When the count reaches zero the notify
+ * function is forgotten.
+ *
+ * Returns the new count, -EBUSY while another process is in the middle of
+ * a notification, or -EINVAL if the count is already zero or less.
+ * access_lock is released on every path.
+ */
+static int
+enbd_md_dec (struct enbd_md *md) {
+	int res;
+	spin_lock(&md->access_lock);
+	if (md->doing_notify && md->notify_pid != current->pid) {
+		// PTB don't add if we didn't remove
+		spin_unlock (&md->access_lock);
+		return -EBUSY;
+	}
+	if (md->count <= 0) {
+		/* This path used to return with access_lock still held. */
+		spin_unlock (&md->access_lock);
+		return -EINVAL;
+	}
+	if ((res = --md->count) <= 0)
+		md->notify_fn = NULL;
+	spin_unlock(&md->access_lock);
+	return res;
+}
+/*
+ * Take one reference on md. Returns the new count, or -EBUSY while another
+ * process is in the middle of a notification.
+ */
+static int
+enbd_md_inc (struct enbd_md *md) {
+	int res;
+	spin_lock(&md->access_lock);
+	if (md->doing_notify && md->notify_pid != current->pid) {
+		// PTB don't add if we didn't remove
+		spin_unlock (&md->access_lock);
+		return -EBUSY;
+	}
+	res = ++md->count;
+	spin_unlock(&md->access_lock);
+	return res;
+}
+/*
+ * Install fn as the notify function and take a reference, unless a notify
+ * function is already installed. Always returns 0.
+ */
+static int
+enbd_md_reg (struct enbd_md *md, int(*fn)(dev_t, int)) {
+	spin_lock(&md->access_lock);
+	if (!md->notify_fn) {
+		md->notify_fn = fn;
+		md->count++;
+	}
+	spin_unlock(&md->access_lock);
+	return 0;
+}
+
+/*
+ * PTB tell md devices in which we are embedded that we are alright
+ *
+ * @md       the driver-wide md notification record
+ * @enbd_dev the device number to tell them about
+ * @cmd      the command handed to the registered notify function
+ *
+ * The notify function is called with access_lock dropped; doing_notify and
+ * notify_pid mark the notification as in progress meanwhile.
+ * Always returns 0; a failure of the notify function is only logged.
+ */
+static int
+enbd_notify_md_device (struct enbd_md *md, dev_t enbd_dev, int cmd)
+{
+	int err;
+
+	spin_lock (&md->access_lock);
+	if (md->count > 0 && md->notify_fn) {
+		md->doing_notify = 1;
+		md->notify_pid = current->pid;
+		spin_unlock (&md->access_lock);
+		ENBD_ALERT ("notifying %d for %x:%x to raid devices via fn\n",
+			    cmd, MAJOR(enbd_dev), MINOR(enbd_dev));
+		err = md->notify_fn (enbd_dev, cmd);
+		spin_lock (&md->access_lock);
+		md->doing_notify = 0;
+		md->notify_pid = -1;
+		if (err < 0) {
+			ENBD_ALERT ("ioctl %d to raid devices returned %d\n",
+				    cmd, err);
+		}
+	}
+	spin_unlock (&md->access_lock);
+
+	return 0;
+}
+
+
+
+/*
+ * Reset md to "nothing registered" and install its method pointers.
+ */
+void
+enbd_init_md(struct enbd_md *md)
+{
+	md->notify_fn = NULL;
+	md->count = 0;
+	md->doing_notify = 0;
+	md->notify_pid = -1;
+	spin_lock_init(&md->access_lock);
+	md->notify = enbd_notify_md_device;
+	md->dec = enbd_md_dec;
+	md->inc = enbd_md_inc;
+	md->reg = enbd_md_reg;
+}
+
--- linux-2.6.7/drivers/block/enbd/enbd_proc.c.pre-enbd
+++ linux-2.6.7/drivers/block/enbd/enbd_proc.c Tue Mar 16 22:05:40 2004
@@ -0,0 +1,1114 @@
+
+#include
+#include
+#include
+
+extern struct enbd_device * enbd_get(int i);
+
+
+
+
+
+/*
+ * Call f on device i when x is non-zero and i is a valid device index.
+ * Devices that enbd_get() does not return, or that do not carry the ENBD
+ * magic, are left alone, as in set_enable().
+ */
+static void
+do_generic_stuff (long x, int i, int (*f)(void *)) {
+
+	void do_f (void) {
+		struct enbd_device *lo = enbd_get(i);
+		if (!lo || lo->magic != ENBD_DEV_MAGIC)
+			return;
+		if (x != 0) {
+			f (lo);
+			return;
+		};
+	};
+	if (i >= 0 && i < MAX_NBD) {
+		do_f ();
+		return;
+	}
+}
+
+static void
+do_soft_reset (long reset, int i) {
+
+	int f(struct enbd_device *lo) {
+		int res = lo->soft_reset(lo);
+		lo->reenable_delay(lo, 5);
+		return res;
+	}
+	do_generic_stuff(reset, i, (int(*)(void *))f);
+}
+static void
+do_hard_reset (long reset, int i) {
+
+	int f(struct enbd_device *lo) {
+		int res = lo->hard_reset(lo);
+		return res;
+	}
+	do_generic_stuff(reset, i, (int(*)(void *))f);
+}
+static void
+do_reset (long reset, int i) {
+
+	int f(struct enbd_device *lo) {
+		lo->reset(lo);
+		return 0;
+	}
+	do_generic_stuff(reset, i, (int(*)(void *))f);
+}
+
+/*
+ * PTB This is just to get a nice limited width integer printout in proc!
+ * use endpos (<= 8) spaces at most. We serve from a static buffer size 16.
+ *
+ * The result points into that static buffer, so it is overwritten by the
+ * next call.
+ */
+static char *
+display (unsigned n, int endpos)
+{
+	// PTB use endpos (<= 8) spaces at most
+	static char buf[16];
+	int units = 0;
+	int decimals = 0;
+	int decpos = endpos;
+	buf[endpos--] = 0;
+	// PTB find the right units to display. U or K or M or G.
+	while (n >= 1 << 10) {
+		decimals = n & ((1 << 10) - 1);
+		n >>= 10;
+		units++;
+	}
+	switch (units) {
+	case 0:
+		break;
+	case 1:
+		buf[endpos--] = 'K';
+		break;
+	case 2:
+		buf[endpos--] = 'M';
+		break;
+	case 3:
+		buf[endpos--] = 'G';
+		break;
+	case 4:
+		buf[endpos--] = 'T';
+		break;
+	}
+	// PTB write the whole digits (something between 0 and 1023 inclusive)
+	if (n == 0) {
+		buf[endpos--] = '0';
+	} else {
+		while (endpos >= 0 && n > 0) {
+			buf[endpos--] = '0' + n % 10;
+			n /= 10;
+		}
+	}
+	// PTB if there is space and cause, add decimal digits
+	if (endpos >= 1 && units > 0) {
+		int k = 0;
+		char unitchar = buf[--decpos];
+		buf[decpos + k++] = '.';
+		while (endpos >= k) {
+			int digit = (decimals * 10) >> 10;
+			buf[decpos + k++] = '0' + digit;
+			decimals -= (digit << 10) / 10;
+			decimals *= 10;
+		}
+		buf[decpos + k++] = unitchar;
+		buf[decpos + k] = 0;
+	}
+	// PTB report the start position
+	return buf + endpos + 1;
+}
+
+
+/*
+ * Set (x != 0) or clear (x == 0) the flag bits in mask on device i, or on
+ * every device when i is not a valid index. Devices that enbd_get() does
+ * not return, or that do not carry the ENBD magic, are skipped.
+ */
+static void
+set_generic_flag (long x, int i, int mask)
+{
+	void set_x (void) {
+		struct enbd_device *lo = enbd_get(i);
+		if (!lo || lo->magic != ENBD_DEV_MAGIC)
+			return;
+		if (x != 0) {
+			atomic_set_mask (mask, &lo->flags);
+			return;
+		};
+		atomic_clear_mask (mask, &lo->flags);
+	};
+
+	if (i >= 0 && i < MAX_NBD) {
+		set_x ();
+		return;
+	}
+	for (i = 0; i < MAX_NBD; i++) {
+		set_x ();
+	}
+}
+
+static void
+set_sync_intvl (long sync_intvl, int i)
+{
+	set_generic_flag(sync_intvl, i, ENBD_SYNC);
+}
+
+
+static void
+set_show_errs (long show_errs, int i)
+{
+	set_generic_flag(show_errs, i, ENBD_SHOW_ERRS);
+}
+
+static void
+set_md5sum (long md5sum, int i)
+{
+	set_generic_flag(md5sum, i, ENBD_MD5SUM);
+}
+
+static void
+set_md5sum_noauto (long md5sum_noauto, int i)
+{
+	set_generic_flag(md5sum_noauto, i, ENBD_MD5SUM_NOAUTO);
+}
+
+/*
+ * Enable or disable device i, or every device when i is not a valid index.
+ */
+static void
+set_enable (long enable, int i)
+{
+	void set_e (void) {
+		struct enbd_device *lo = enbd_get(i);
+		if (!lo || lo->magic != ENBD_DEV_MAGIC)
+			return;
+		if (enable != 0) {
+			if (!(atomic_read (&lo->flags) & ENBD_ENABLED)) {
+				lo->enable (lo);
+				return;
+			}
+		};
+		/*
+		 * NOTE(review): an enable request for a device that is
+		 * already enabled also falls through to here and disables
+		 * it. Confirm that this is intended.
+		 */
+		lo->disable (lo);
+	};
+
+	if (i >= 0 && i < MAX_NBD) {
+		set_e ();
+		return;
+	}
+	for (i = 0; i < MAX_NBD; i++) {
+		set_e ();
+	}
+}
+
+static void
+set_direct (long direct, int i)
+{
+	set_generic_flag(direct, i, ENBD_DIRECT);
+}
+
+#ifndef NO_BUFFERED_WRITES
+static void
+set_buffer_writes (long buffer_writes, int i)
+{
+	set_generic_flag(buffer_writes, i, ENBD_BUFFERWR);
+}
+#endif
+
+/*
+ * Store mr as the merge_requests setting of device i, or of every device
+ * when i is not a valid index. Devices that enbd_get() does not return, or
+ * that do not carry the ENBD magic, are skipped.
+ */
+static void
+set_merge_requests (long mr, int i)
+{
+	void set_mr (void) {
+		struct enbd_device *lo = enbd_get(i);
+		if (!lo || lo->magic != ENBD_DEV_MAGIC)
+			return;
+		atomic_set (&lo->merge_requests, mr);
+	}
+
+	if (i >= 0 && i < MAX_NBD) {
+		set_mr ();
+		return;
+	}
+	for (i = 0; i < MAX_NBD; i++) {
+		set_mr ();
+	}
+}
+
+static int
+enbd_read_proc (char *buf, char **start, off_t offset, int len, int *eof,
+		void *data)
+{
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+	const int limit = MIN (PAGE_SIZE, len) - 80;
+	static int i;
+	struct enbd_device *lo;
+	static int last;
+	static void *next_label;
+	static char *next_label_name;
+	static int total;
+	unsigned long flags;
+
+	if (offset > 0 && !next_label) {
+		// PTB we have finished
+		*eof = 1;
+		*start = (char*)1;	// PTB method (1) of (0-2) in generic.c
+		return 0;
+	}
+
+	if (offset <= 0) {
+		// PTB do static inits first time through
+		last = -1;
+		i = 0;
+		next_label = NULL;
+		next_label_name = NULL;
+		total = 0;
+	}
+
+	// PTB start this bytecount
+	len = 0;
+
+#define ENBD_PROC_LABEL(n) \
+	next_label = &&label_##n; \
+	next_label_name = "label_" #n; \
+	if (len > limit) { \
+		*start = (char *) (unsigned long) (len > 0 ?
len : 1); \ + total += len; \ + return len;\ + } \ + label_##n: + + for ( /* static init */ ; i < MAX_NBD; i++) { + + char *devnam; + + lo = enbd_get(i); + devnam = lo->devnam; + if (lo->nslot <= 0) { + next_label = NULL; + continue; + } + + // PTB computed goto next not-done + if (next_label) { + void *label = next_label; + next_label = NULL; + next_label_name = NULL; + len = 0; + goto *label; + } + + ENBD_PROC_LABEL (1); + + if (last == i - 2) { + struct enbd_device * lo = enbd_get (i - 1); + char *prevdevnam = lo->devnam; + len += + sprintf (buf + len, "Device %s:\tClosed\n", + prevdevnam); + } + if (last < i - 2) { + struct enbd_device * llo = enbd_get (last + 1); + struct enbd_device * plo = enbd_get (i - 1); + char lastdevnam[3]; + char prevdevnam[3]; + strncpy (lastdevnam, llo->devnam, 3); + strncpy (prevdevnam, plo->devnam, 3); + len += + sprintf (buf + len, "Device %s-%s:\tClosed\n", + lastdevnam, prevdevnam); + } + + ENBD_PROC_LABEL (2); + + len += + sprintf (buf + len, "Device %s:\tOpen " "\n", devnam); + + ENBD_PROC_LABEL (3); + + len += sprintf (buf + len, + "[%s] State:\t%s%s%s%s%s%s%s%s%s%s%s%slast error %d, lives %d, bp %d\n", + devnam, atomic_read (&lo->flags) + & ENBD_INITIALISED ? "" : "uninitialized, ", + atomic_read (&lo->flags) + & ENBD_WRITE_NOCHK ? "noverify, " : + "verify, ", lo->read_only(lo) ? "ro, " : "rw, ", + atomic_read(&lo->merge_requests) ? "merge requests, " : "", +#ifndef NO_BUFFERED_WRITES + atomic_read (&lo->flags) + & ENBD_BUFFERWR ? "buffer writes, " : "", +#else + "", +#endif /* NO_BUFFERED_WRITES */ + atomic_read (&lo->flags) + & ENBD_ENABLED ? "enabled, " : "disabled, ", + atomic_read (&lo->flags) + & ENBD_VALIDATED ? "validated, " : "", + atomic_read (&lo->flags) + & ENBD_REMOTE_INVALID ? "remote invalid, " : "", + atomic_read (&lo->flags) + & ENBD_SHOW_ERRS ? "show_errs, " : "", + atomic_read (&lo->flags) + & ENBD_DIRECT ? "direct, " : "", + atomic_read (&lo->flags) + & ENBD_SYNC ? 
"sync, " : "", + atomic_read (&lo->flags) + & ENBD_MD5SUM ? "md5sum, " : "", + lo->harderror, + lo->lives - + ((atomic_read (&lo->flags) & ENBD_ENABLED) ? + 1 : 0), 0 //atomic_read(&buffermem_pages) + ); + + ENBD_PROC_LABEL (4); + + do { // PTB begin long do once block + int countq[2] = { 0, 0 }; + int cmd; + + struct list_head *pos; + + read_lock_irqsave (&lo->queue_lock, flags); + + list_for_each (pos, &lo->queue) { + struct request *req = + list_entry (pos, struct request, queuelist); + if (countq[READ] + countq[WRITE] > 1000) + break; + + cmd = rq_data_dir (req); + countq[cmd]++; + } + + read_unlock_irqrestore (&lo->queue_lock, flags); + + len += sprintf (buf + len, + "[%s] Queued:\t+%dR/%dW curr (check %dR/%dW) +%dR/%dW max\n", + devnam, + atomic_read (&lo->countq[READ]), + atomic_read (&lo->countq[WRITE]), + countq[READ], countq[WRITE], + atomic_read (&lo->maxq[READ]), + atomic_read (&lo->maxq[WRITE])); + } while (0); // PTB end long do once block + + ENBD_PROC_LABEL (5); + + len += sprintf (buf + len, + "[%s] Buffersize:\t%d\t(sectors=%d, blocks=%d)\n", + devnam, lo->bufsiz, lo->max_sectors, + lo->max_sectors / (lo->blksize >> 9)); + len += + sprintf (buf + len, "[%s] Blocksize:\t%d\t(log=%d)\n", + devnam, lo->blksize, lo->logblksize); + len += + sprintf (buf + len, "[%s] Size:\t%luKB\n", devnam, + (unsigned long) (lo->bytesize >> 10)); + len += + sprintf (buf + len, "[%s] Blocks:\t%u\n", devnam, + lo->size >> (lo->logblksize - 10)); + + ENBD_PROC_LABEL (6); + + len += + sprintf (buf + len, "[%s] Sockets:\t%d", devnam, + lo->nslot); + + ENBD_PROC_LABEL (7); + + do { // PTB begin short do once block + int j; + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = &lo->slots[j]; + if (j != atomic_read (&lo->islot)) { + len += + sprintf (buf + len, "\t(%s)", + slotj->file ? "+" : "-"); + } else { + len += + sprintf (buf + len, "\t(%s)", + slotj->file ? 
"*" : "."); + } + } + } while (0); // PTB end short do once block + + len += sprintf (buf + len, "\n"); + + ENBD_PROC_LABEL (8); + + len += sprintf (buf + len, "[%s] Requested:\t%s", devnam, + display (atomic_read + (&lo->requests_in[READ]) + + atomic_read (&lo->requests_in + [WRITE]), 7)); + + ENBD_PROC_LABEL (9); + + do { // PTB begin short do once block + int j; + char buff[2][8]; + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%s)", + display (slotj->in, 5)); + } + strncpy (buff[0], + display (atomic_read + (&lo->requests_in[READ]), 6), 7); + strncpy (buff[1], + display (atomic_read + (&lo->requests_in[WRITE]), 6), + 7); + len += + sprintf (buf + len, "\t%sR/%sW", buff[0], + buff[1]); + lo->set_speed (lo); + len += sprintf (buf + len, "\tmax %d", + atomic_read (&lo->maxreqblks)); + } while (0); // PTB end short do once block + + len += sprintf (buf + len, "\n"); + len += sprintf (buf + len, "[%s] Despatched:\t%s", devnam, + display (atomic_read + (&lo->requests_out[READ]) + + atomic_read (&lo->requests_out + [WRITE]), 7)); + + ENBD_PROC_LABEL (10); + + do { // PTB begin short do once block + int j; + char buff[2][8]; + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%s)", + display (slotj->out, 5)); + } + strncpy (buff[0], + display (atomic_read + (&lo->requests_out[READ]), 6), + 7); + strncpy (buff[1], + display (atomic_read + (&lo->requests_out[WRITE]), 6), + 7); + len += + sprintf (buf + len, "\t%sR/%sW", buff[0], + buff[1]); + len += + sprintf (buf + len, "\tmd5 %sW", + display (atomic_read + (&lo->wrequests_5to), 5)); + len += + sprintf (buf + len, " (%s eq,", + display (atomic_read + (&lo->wrequests_5so), 5)); + len += + sprintf (buf + len, " %s ne,", + display (atomic_read + (&lo->wrequests_5wo), 5)); + len += + sprintf (buf + len, " %s dn)", + display (atomic_read + (&lo->wrequests_5eo), 5)); + } while (0); // PTB end short 
do once block + + len += sprintf (buf + len, "\n"); + len += sprintf (buf + len, "[%s] Errored:\t%s", devnam, + display (atomic_read (&lo->requests_err), + 7)); + + ENBD_PROC_LABEL (11); + + do { // PTB begin short do once block + int j; + char buff[2][8]; + int toterrs = 0; + + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%s)", + display (slotj->err, 5)); + toterrs += slotj->err; + } + strncpy (buff[0], display (toterrs, 6), 7); + strncpy (buff[1], + display (atomic_read (&lo->requests_err) - + toterrs, 6), 7); + len += + sprintf (buf + len, "\t%s+%s\n", buff[0], + buff[1]); + } while (0); // PTB end short do once block + + ENBD_PROC_LABEL (12); + + do { // PTB begin long do once block + int pending_rblks = 0; /* PTB reads not reached the slots yet */ + int pending_wblks = 0; /* PTB writes not reached the slots yet */ + int blks = 0; + + read_lock_irqsave (&lo->queue_lock, flags); + + do { // PTB begin short do once block + struct list_head *pos; + + int count = 0; + struct request *req; + + list_for_each (pos, &lo->queue) { + req = + list_entry (pos, struct request, + queuelist); + if (count++ > 1000) + break; + blks = req->nr_sectors / lo->blksize; + if (blks > 0) { + switch (rq_data_dir (req)) { + case READ: + pending_rblks += + blks; + break; + case WRITE: + pending_wblks += + blks; + break; + } + } + } + } while (0); // PTB end short do once block + + read_unlock_irqrestore (&lo->queue_lock, flags); + len += + sprintf (buf + len, "[%s] Pending:\t%d", devnam, + atomic_read (&lo->requests_req[READ]) + + atomic_read (&lo->requests_req[WRITE])); + + do { // PTB begin short do once block + int j; + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = + &lo->slots[j]; + len += + sprintf (buf + len, "\t(%d)", + slotj->req); + } + } while (0); // PTB end short do once block + + len += sprintf (buf + len, + "\t%dR/%dW+%dR/%dW\n", + atomic_read (&lo->requests_req[READ]), + atomic_read 
(&lo->requests_req[WRITE]), + pending_rblks, pending_wblks); + + } while (0); // PTB end long do once block + + ENBD_PROC_LABEL (13); + + do { // PTB begin long do once block + char buff[10][8]; + int shift = lo->logblksize; + + strncpy (buff[0], + display (atomic_read (&lo->wspeed.speed) + << shift, 5), 7); + strncpy (buff[1], + display (atomic_read (&lo->wspeed.speedav) + << shift, 5), 7); + strncpy (buff[2], + display (atomic_read + (&lo->wspeed.speedmax) << shift, + 5), 7); + + strncpy (buff[3], + display (atomic_read (&lo->rspeed.speed) + << shift, 5), 7); + strncpy (buff[4], + display (atomic_read (&lo->rspeed.speedav) + << shift, 5), 7); + strncpy (buff[5], + display (atomic_read + (&lo->rspeed.speedmax) << shift, + 5), 7); + + strncpy (buff[6], + display (atomic_read (&lo->tspeed.speed) + << shift, 5), 7); + strncpy (buff[7], + display (atomic_read (&lo->tspeed.speedav) + << shift, 5), 7); + strncpy (buff[8], + display (atomic_read + (&lo->tspeed.speedmax) << shift, + 5), 7); + + len += + sprintf (buf + len, "[%s] B/s now:", devnam); + len += + sprintf (buf + len, "\t%s\t(%sR+%sW)\n", buff[6], + buff[3], buff[0]); + len += + sprintf (buf + len, "[%s] B/s ave:", devnam); + len += + sprintf (buf + len, "\t%s\t(%sR+%sW)\n", buff[7], + buff[4], buff[1]); + len += + sprintf (buf + len, "[%s] B/s max:", devnam); + len += + sprintf (buf + len, "\t%s\t(%sR+%sW)\n", buff[8], + buff[5], buff[2]); + } while (0); // PTB end long do once block + + do { // PTB begin short do once block + int blks; + int tot_reqs = 0; + + len += + sprintf (buf + len, "[%s] Spectrum:", devnam); + for (blks = 0; + blks <= atomic_read (&lo->maxreqblks); blks++) { + tot_reqs += + atomic_read (&lo->req_in[READ][blks]) + + atomic_read (&lo->req_in[WRITE][blks]); + } + + for (blks = 0; + blks <= atomic_read (&lo->maxreqblks); blks++) { + int req_blks = + atomic_read (&lo->req_in[READ][blks]) + + atomic_read (&lo->req_in[WRITE][blks]); + int percent = + tot_reqs > + 0 ? 
(100 * req_blks) / tot_reqs : 0; + if (percent <= 0) + continue; + len += + sprintf (buf + len, "\t%u%%%d", percent, + blks); + } + len += sprintf (buf + len, "\n"); + } while (0); // PTB end short do once block + + ENBD_PROC_LABEL (14); + + len += sprintf (buf + len, "[%s] Kthreads:\t%d", devnam, + atomic_read (&lo->kthreads)); + len += + sprintf (buf + len, "\t(%d waiting/%d running/%d max)\n", + atomic_read (&lo->kwaiters), + atomic_read (&lo->kthreads) - + atomic_read (&lo->kwaiters), + atomic_read (&lo->kmax)); + + ENBD_PROC_LABEL (15); + + len += sprintf (buf + len, "[%s] Cthreads:\t%d", devnam, + atomic_read (&lo->cthreads)); + + ENBD_PROC_LABEL (16); + + do { + int j; + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = &lo->slots[j]; + int state = + ((atomic_read(&slotj->flags) & ENBD_SLOT_RUNNING) ? 1 : + 0) + + ((atomic_read(&slotj->flags) & ENBD_SLOT_WAITING) ? 2 : + 0); + char *desc = "?"; + switch (state) { + case 0: + desc = "-"; + break; /* PTB not in */ + case 1: + desc = "*"; + break; /* PTB in and not waiting */ + case 2: + desc = "?"; + break; /* PTB impossible */ + case 3: + desc = "+"; + break; /* PTB in and waiting */ + } + len += sprintf (buf + len, "\t(%s)", desc); + } + } while (0); + + len += sprintf (buf + len, "\n"); + + ENBD_PROC_LABEL (17); + + last = i; + len += sprintf (buf + len, "[%s] Cpids:\t%d", devnam, + atomic_read (&lo->cthreads)); + + do { + int j; + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = &lo->slots[j]; + len += + sprintf (buf + len, "\t(%u)", slotj->pid); + } + len += sprintf (buf + len, "\n"); + } while (0); + + do { + int j, k; + for (j = 0; j < lo->nslot; j++) { + struct enbd_slot *slotj = &lo->slots[j]; + if (slotj->spid != 0) + break; + } + if (j < lo->nslot) { + len += + sprintf (buf + len, "[%s] Kpids:\t%d", + devnam, + atomic_read (&lo->cthreads)); + for (k = 0; k < lo->nslot; k++) { + struct enbd_slot *slotk = + &lo->slots[k]; + len += + sprintf (buf + len, "\t(%u)", + 
/*
 * PTB read a decimal int from the first @len bytes of @buf.
 *
 * Leading blanks and tabs are skipped, then one optional '+' or '-' is
 * accepted, and the digits must follow the sign immediately. Parsing
 * stops at the first non-digit after the number. Any other character
 * before the first digit makes the parse fail.
 *
 * The sign is now honoured: earlier the sign flag was overwritten when
 * the first digit was met, so "-5" was stored as 5. No overflow check
 * is made on very long digit strings.
 *
 * @buf the text to scan (need not be NUL terminated)
 * @len number of bytes of @buf that may be read
 * @n   receives the value; left untouched on failure
 *
 * Return number of ints read (0 or 1).
 */
static int
sscani (const char *buf, int len, int *n)
{
	int i, a = 0;
	int seen_sign = 0;
	int negative = 0;

	// PTB look for first significant character
	for (i = 0; i < len; i++) {
		char c = buf[i];

		if (c >= '0' && c <= '9')
			break;
		if (c == ' ' || c == '\t') {
			// PTB no white space allowed between sign and digits
			if (seen_sign)
				return 0;
		} else if (c == '-' || c == '+') {
			if (seen_sign)
				return 0;
			seen_sign = 1;
			negative = (c == '-');
		} else {
			return 0;
		}
	}
	// PTB i now points at first digit if there is one
	if (i >= len)
		return 0;

	for (; i < len; i++) {
		char c = buf[i];

		if (c < '0' || c > '9')
			break;
		a *= 10;
		a += c - '0';
	}
	*n = negative ? -a : a;
	return 1;
}

/*
 * look for a lower case device letter code ("a" or "aa") in the first
 * @len bytes of @buf and save the device number to which it refers.
 * Letters are read as base 26 digits with 'a' = 0, so "b" is 1 and
 * "ba" is 26. Only leading spaces (not tabs) are skipped.
 *
 * @buf the text to scan (need not be NUL terminated)
 * @len number of bytes of @buf that may be read
 * @n   receives the value; left untouched on failure
 *
 * Return number of device letter codes found (0 or 1).
 */
static int
sscana (const char *buf, int len, int *n)
{
	int i, a = 0;

	for (i = 0; i < len && buf[i] == ' '; i++)
		;
	if (i >= len || buf[i] < 'a' || buf[i] > 'z')
		return 0;

	for (; i < len && buf[i] >= 'a' && buf[i] <= 'z'; i++)
		a = a * 26 + (buf[i] - 'a');

	*n = a;
	return 1;
}

/*
 * PTB step over blanks and tabs, keeping the remaining length in step.
 */
static void
getarg_skip_ws (const char **bufp, int *lenp)
{
	while (*lenp > 0 && (**bufp == ' ' || **bufp == '\t')) {
		(*bufp)++;
		(*lenp)--;
	}
}

/*
 * read an integer (or letter code) arg into an int. The accepted forms
 * are "key = value" and "key[index] = value", where value and index
 * are each either a decimal integer or a device letter code.
 *
 * The buffer is never written to and never read beyond @buflen bytes,
 * so it does not have to be NUL terminated.
 *
 * @buffer the command text
 * @buflen number of valid bytes in @buffer
 * @key    the preceding key that must match
 * @i      is the integer value that results
 * @j      is an index if one is supplied (foo[j] = i), else -1
 *
 * Return the number of values read (0 or 1), and -1 for no keymatch
 * or a malformed command (missing ']' or '=').
 */
static int
getarg (const char *buffer, int buflen, const char *key, int *i, int *j)
{
	int keylen;

	getarg_skip_ws (&buffer, &buflen);

	keylen = strlen (key);
	if (buflen < keylen || strncmp (buffer, key, keylen))
		return -1;

	buffer += keylen;
	buflen -= keylen;

	getarg_skip_ws (&buffer, &buflen);

	*j = -1;
	if (buflen > 0 && *buffer == '[') {
		int indexlen;

		buffer++;
		buflen--;

		getarg_skip_ws (&buffer, &buflen);

		// PTB find the closing bracket inside what we were given
		for (indexlen = 0; indexlen < buflen; indexlen++) {
			if (buffer[indexlen] == ']')
				break;
		}
		if (indexlen >= buflen)
			return -1;

		// PTB a number is tried first, then a letter code
		if (sscani (buffer, indexlen, j) < 1
		    && sscana (buffer, indexlen, j) < 1)
			return 0;

		// PTB step over the index and the ']'
		buffer += indexlen + 1;
		buflen -= indexlen + 1;

		getarg_skip_ws (&buffer, &buflen);
	}

	if (buflen <= 0 || *buffer != '=')
		return -1;

	buffer++;
	buflen--;

	getarg_skip_ws (&buffer, &buflen);

	if (sscani (buffer, buflen, i) < 1 && sscana (buffer, buflen, i) < 1)
		return 0;
	return 1;
}
+ */ +static int +enbd_write_proc (struct file *file, const char *buffer, unsigned long count, + void *data) +{ + + switch (count) { + + int i; + + case 2: + if (buffer[1] != '\n') + break; + /* else fallthru to case 1 */ + case 1: + switch (*buffer) { + case '1': + for (i = 0; i < MAX_NBD; i++) { + struct enbd_device *lo = enbd_get(i); + lo->hard_reset (lo); + } + break; + case '0': + for (i = 0; i < MAX_NBD; i++) { + // PTB this takes the io spinlock and our spinlock. + struct enbd_device *lo = enbd_get(i); + lo->soft_reset (lo); + lo->reenable_delay(lo, 5); + } + break; + } + break; + default: + do { + int index; + int merge_requests; + int sync_intvl; + int show_errs; + int md5sum; +#ifndef NO_BUFFERED_WRITES + int buffer_writes; +#endif + int enable; + int direct; + int reset; + int soft_reset; + int hard_reset; + + if (getarg (buffer, count, "merge_requests", + &merge_requests, &index) >= 0) { + // merge_requests + set_merge_requests (merge_requests, index); + break; + } + if (getarg (buffer, count, "sync_intvl", + &sync_intvl, &index) >= 0 + || getarg (buffer, count, "sync", + &sync_intvl, &index) >= 0) { + // sync_intvl + set_sync_intvl (sync_intvl, index); + break; + } + if (getarg (buffer, count, "show_errs", + &show_errs, &index) >= 0) { + // show_errs + set_show_errs (show_errs, index); + break; + } + if (getarg (buffer, count, "md5sum", + &md5sum, &index) >= 0) { + // md5sum + set_md5sum (md5sum, index); + break; + } +#ifndef NO_BUFFERED_WRITES + if (getarg (buffer, count, "buffer_writes", + &buffer_writes, &index) >= 0) { + // buffer_writes + set_buffer_writes (buffer_writes, index); + break; + } +#endif /* NO_BUFFERED_WRITES */ + if (getarg (buffer, count, "enable", + &enable, &index) >= 0) { + // enable + set_enable (enable, index); + break; + } + if (getarg (buffer, count, "direct", + &direct, &index) >= 0) { + // enable + set_direct(direct, index); + break; + } + if (getarg (buffer, count, "reset", + &reset, &index) >= 0) { + // reset + 
do_reset(reset, index); + break; + } + if (getarg (buffer, count, "soft_reset", + &soft_reset, &index) >= 0) { + // reset + do_soft_reset(soft_reset, index); + break; + } + if (getarg (buffer, count, "hard_reset", + &hard_reset, &index) >= 0) { + // reset + do_hard_reset(soft_reset, index); + break; + } + ENBD_ERROR ("illegal %ld character command\n", + count); + return -EINVAL; + } while (0); + break; + } + return count; +} + +void +enbd_init_proc(struct proc_dir_entry *res) { + res->read_proc = enbd_read_proc; + res->write_proc = enbd_write_proc; +} + --- linux-2.6.7/drivers/block/enbd/enbd_seqno.c.pre-enbd +++ linux-2.6.7/drivers/block/enbd/enbd_seqno.c Thu Mar 18 17:41:52 2004 @@ -0,0 +1,59 @@ +#include +#include + + +/* + * PTB increment the devices seqno + * + * @lo the nbd device to increment the seqno of + */ +static void +seqno_inc(struct enbd_seqno *nseqno) +{ + if (++nseqno->seqno) + return; + + // PTB next generation ! + atomic_inc (&nseqno->seqno_gen); +} +static int +seqno_get (struct enbd_seqno *nseqno) +{ + return nseqno->seqno; +} +static void +seqno_reset (struct enbd_seqno *nseqno) +{ + nseqno->seqno = 0; + atomic_set(&nseqno->seqno_gen,0); +} + +/* + * PTB convert a seqno number into one with an extra generation number + * in the msb, so that it can be compared with others. return the + * result. + * + * We add the current generation no. to small seqnos, and we add the + * previous generation no. to large seqnos. 
+ * + * @lo the nbd device to look at + * @seqno the small sequence number to return the full seq number for + */ +static unsigned int +seqno_calc (struct enbd_seqno *nseqno, unsigned int seqno) +{ + return seqno; +} + +void enbd_init_seqno (struct enbd_seqno *nseqno) { + + seqno_reset(nseqno); + + nseqno->inc = seqno_inc; + nseqno->get = seqno_get; + nseqno->reset = seqno_reset; + nseqno->calc = seqno_calc; +} + + + --- linux-2.6.7/drivers/block/enbd/enbd_speed.c.pre-enbd +++ linux-2.6.7/drivers/block/enbd/enbd_speed.c Mon Jan 19 03:30:08 2004 @@ -0,0 +1,65 @@ +#include +#include +#include + +/* + * PTB - update speed counters (if at least 5s has passed) + * + * @spd the speed struct to update + */ +static void +spd_update (struct enbd_speed *spd, int distance) +{ + + // last time we measured + int lastjiffy = atomic_read (&spd->jiffy); + // jiffies since last time + int djiffy = jiffies - lastjiffy; + + // previous no we measured + int lastdist = atomic_read (&spd->distance); + // blocks since last time + int ddistance = distance - lastdist; + + // write every 5 second in time + if (djiffy > 5 * HZ) { + + // jiffies since first time + int tjiffy = jiffies - atomic_read (&spd->frstj); + + // max tot speed measured so far + int speedmax = atomic_read (&spd->speedmax); + + // last instantaneous speed we measured + int lastspeed = atomic_read (&spd->speed); + + // instantaneous read blocks/s + int speed = djiffy ? (ddistance * HZ) / djiffy : 0; + + // smoothed KB/s + int speedsmoothed = + (djiffy * speed + HZ * lastspeed) / (djiffy + HZ); + + // average speed to now in KB/s + int speedav = tjiffy ? (distance * HZ) / tjiffy : 0; + + // smoothing count for max + int speedhi = + (speedav > speedsmoothed) ? 
speedav : speedsmoothed; + + // doing settings + atomic_set (&spd->speed, speedsmoothed); + if (speedhi > speedmax) + atomic_set (&spd->speedmax, speedhi); + atomic_set (&spd->distance, distance); + atomic_set (&spd->speedav, speedav); + atomic_set (&spd->jiffy, jiffies); + } +} + +void +enbd_init_speed(struct enbd_speed *spd) { + memset(spd, 0, sizeof(*spd)); + spd->update = spd_update; +} + --- linux-2.6.7/include/linux/enbd.h.pre-enbd +++ linux-2.6.7/include/linux/enbd.h Tue Jun 15 22:35:08 2004 @@ -0,0 +1,509 @@ +#ifndef LINUX_ENBD_H +#define LINUX_ENBD_H + +/* unsigned comments are Pavel's originals for 2.1.* + * pavel@atrey.karlin.mff.cuni.cz (Pavel Machek) + * comments marked PTB are from + * ptb@it.uc3m.es (Peter T. Breuer) + * comments marked AMARIN are from + * amarin@it.uc3m.es (Andres Marin Lopez) + */ + +#include + +#ifndef ENBD_VERSION +#define ENBD_VERSION "2.4.30 $Date: 2002/09/17 16:33:22 $" +#endif /*ENBD_VERSION*/ + + /* + * Third type of request apart from READ or WRITE + */ + #ifndef IOCTL + # define IOCTL 2 + #endif + /* + * and fourth .. 
+ */ + #ifndef MD5SUM + # define MD5SUM 3 + #endif + + #define NRQ_TYPES 4 + +/* PTB - new style ioctl assignments */ + #define ENBD_SET_SOCK _IOW(0xab, 0x00, int) + #define ENBD_TEST_IOCTL1 _IOW(0xab, 0x01, int) + #define ENBD_SET_SIZE _IOW(0xab, 0x02, int) + #define ENBD_DO_IT _IOW(0xab, 0x03, int) + #define ENBD_CLEAR_SOCK _IOW(0xab, 0x04, int) + #define ENBD_CLEAR_QUE _IO (0xab, 0x05) + #define ENBD_PRINT_DEBUG _IO (0xab, 0x06) + #define ENBD_TEST_IOCTL2 _IOR(0xab, 0x07, int) + #define ENBD_HARD_RESET _IO (0xab, 0x09) + #define ENBD_DEC_USE_COUNT _IO (0xab, 0x09) + #define MY_NBD_ACK _IOW(0xab, 0x0a, char *) + #define MY_NBD_GET_REQ _IOW(0xab, 0x0b, char *) + #define MY_NBD_REG_BUF _IOW(0xab, 0x0c, char *) + #define MY_NBD_CLR_REQ _IOW(0xab, 0x0d, int) + #define MY_NBD_SYNC _IOW(0xab, 0x0e, int) + #define ENBD_SET_SECTORS _IOW(0xab, 0x0f, int) + #define MY_NBD_SET_SIG _IOW(0xab, 0x10, int *) + #define ENBD_RESET _IO (0xab, 0x11) + #define ENBD_TEST_IOCTL3 _IOWR(0xab, 0x12, int) + #define MY_NBD_ERR_REQ _IOW(0xab, 0x13, int) + #define MY_NBD_SET_INTVL _IOW(0xab, 0x14, int) + #define MY_NBD_SET_SHOW_ERRS _IOW(0xab, 0x15, int) + #define ENBD_SET_MD5SUM _IOW(0xab, 0x16, int) + #define MY_NBD_SET_BUFFERWR _IOW(0xab, 0x17, int) + #define MY_NBD_INVALIDATE _IOW(0xab, 0x18, int) + #define MY_NBD_SET_SPID _IOW(0xab, 0x19, int) + #define MY_NBD_SET_RQ_HANDLE _IOW(0xab, 0x1a, void*) + #define MY_NBD_SET_RQ_SEQNO _IOW(0xab, 0x1b, int) + #define MY_NBD_SET_RQ_DIGEST _IOWR(0xab, 0x1d, enbd_digest_t) + #define ENBD_TEST_IOCTL4 _IOR(0xab, 0x1e, char[256]) + #define ENBD_TEST_IOCTL5 _IOWR(0xab, 0x1f, char[256]) + #define ENBD_TEST_IOCTL6 _IO(0xab, 0x20) // special r 256B + #define ENBD_TEST_IOCTL7 _IO(0xab, 0x21) // special rw 256B + #define ENBD_SET_BLKSIZE _IOW(0xab, 0x22, int) + #define ENBD_GET_BLKSIZE _IOR(0xab, 0x23, long) + #define ENBD_SET_PF_MEMALLOC _IOW(0xab, 0x24, int) + #define MY_NBD_SET_DIRECT _IOW(0xab, 0x25, int) + #define MY_NBD_GET_NPORT _IOR(0xab, 0x26, int) 
/*
 * PTB User messaging defs.
 *
 * Each macro prints a header (source line, a per call site counter and
 * the function name) followed by the caller's own printk format and
 * args.
 *
 * The bodies are wrapped in do { } while (0) so that a call followed
 * by a semicolon is exactly one statement and is safe in an unbraced
 * if/else.
 *
 * ENBD_ERROR is rate limited per call site: only the first 3 of every
 * 1000 calls print anything. Before, only the header was limited and
 * the message itself was always printed.
 *
 * The level argument of ENBD_DEBUG is not used.
 */

 #define ENBD_ID "ENBD #%d[%d]: %s "

 #define ENBD_DEBUG(level, s...) \
 do { static int icnt; printk( KERN_DEBUG ENBD_ID, __LINE__, icnt++, __FUNCTION__); printk(s); } while (0)
 #define ENBD_ERROR( s...) \
 do { static int icnt; if (icnt++ % 1000 < 3) { printk( KERN_ERR ENBD_ID, __LINE__, icnt, __FUNCTION__); printk(s); } } while (0)
 #define ENBD_ALERT( s...) \
 do { static int icnt; printk( KERN_ALERT ENBD_ID, __LINE__, icnt++, __FUNCTION__); printk(s); } while (0)
 #define ENBD_INFO( s...) \
 do { static int icnt; printk( KERN_INFO ENBD_ID, __LINE__, icnt++, __FUNCTION__); printk(s); } while (0)
last distance measure */ + atomic_t jiffy; /* PTB add - last jiffies speed set */ + atomic_t frstj; /* PTB add - first jiffies */ + void (*update)(struct enbd_speed*, int); + }; + + struct enbd_ioctl_info { + int cmd; + unsigned long arg; + int size; + struct completion x; + int errors; + char * buffer; + }; + + struct enbd_md_list { + struct list_head list; + dev_t dev; + }; + + struct enbd_seqno; // forward decl + struct enbd_seqno { + unsigned int seqno; /* PTB add - sequence number */ + atomic_t seqno_gen; /* PTB add - seqno genration */ + void (*inc)(struct enbd_seqno *); + int (*get)(struct enbd_seqno *); + void (*reset)(struct enbd_seqno *); + unsigned (*calc)(struct enbd_seqno *, unsigned); + }; + + struct enbd_device { + atomic_t refcnt; + + #define ENBD_READ_ONLY 0x0001 + #define ENBD_WRITE_NOCHK 0x0002 + #define ENBD_INITIALISED 0x0004 + #define ENBD_SIGNED 0x0008 + + #define ENBD_ENABLED 0x0010 + #define ENBD_SIZED 0x0020 + #define ENBD_BLKSIZED 0x0040 + + + #define ENBD_VALIDATING 0x0100 + #define ENBD_SHOW_ERRS 0x0200 + #define ENBD_SYNC 0x0400 + #define ENBD_VALIDATED 0x0800 /* read partition table */ + + #define ENBD_BUFFERWR 0x1000 /* buffer writes to device */ + #define ENBD_REMOTE_INVALID \ + 0x2000 /* remote resource vanished */ + #define ENBD_DIRECT 0x4000 /* convert opens to O_DIRECT */ + #define ENBD_MD5SUM 0x8000 + #define ENBD_RAID_SHOW_ERRS \ + 0x10000 /* set show_errs for raid */ + #define ENBD_SET_SHOW_ERRS \ + 0x20000 /* set show_errs for remote inval */ + #define ENBD_MD5SUM_NOAUTO \ + 0x40000 /* enable/disable md5sums */ + + atomic_t flags; + int harderror; /* Code of hard error */ + int magic; /* FIXME: not if debugging is off */ + struct list_head queue; + rwlock_t queue_lock; /* PTB add - spinlock */ + int nslot; /* PTB add - total slots */ + atomic_t islot; /* PTB add - current slot */ + int aslot; /* PTB add - total active slots*/ + atomic_t requests_in[NRQ_TYPES]; /* PTB add - blocks put on queue */ + atomic_t 
requests_out[NRQ_TYPES]; /* PTB add - blocks out from queue */ + atomic_t requests_err; /* PTB add - blocks erred on queue */ + atomic_t wrequests_5so; /* PTB add - write blocks md5 skip */ + atomic_t wrequests_5wo; /* PTB add - write blocks md5 wr */ + atomic_t wrequests_5eo; /* PTB add - write blocks md5 refus*/ + atomic_t wrequests_5to; /* PTB add - write blocks md5sum */ + atomic_t wrequests_5co; /* PTB add - write blocks md5 tot */ + atomic_t wrequests_5no; /* PTB add - write blocks not md5 */ + atomic_t requests_req[NRQ_TYPES]; /* PTB add - read blocks pending */ + atomic_t kwaiters; /* PTB add - kernel thrds waiting */ + atomic_t kthreads; /* PTB add - kernel threads in */ + atomic_t maxq[NRQ_TYPES]; /* PTB add - max req queue depth */ + atomic_t countq[NRQ_TYPES]; /* PTB add - request queue depth */ + atomic_t errors; /* PTB add - tot requests errored */ + struct enbd_seqno seqno_out; /* PTB add - seq number */ + atomic_t cwaiters; /* PTB add - client thrds waiting */ + atomic_t cthreads; /* PTB add - client threads in */ + atomic_t req_in[NRQ_TYPES][1 + ENBD_MAX_SECTORS/2]; + wait_queue_head_t wq; /* PTB add */ + struct enbd_slot slots[ENBD_MAXCONN]; /* PTB add - client array */ + unsigned blksize; /* PTB add - device blksize in B */ + u64 bytesize; /* PTB add - device size in B */ + u64 sectors; /* PTB add - device size (sectors) */ + unsigned size; /* PTB add - device size in blks */ + unsigned logblksize; /* PTB add - log2 blksize */ + unsigned nbd; /* PTB add - this array index */ + int signature[ENBD_SIGLEN/sizeof(int)]; + /* PTB add - server sig */ + struct file * file; /* PTB add - for ref */ + struct inode * inode; /* PTB add - for ref */ + int bufsiz; /* PTB add - userspace buffer size */ + atomic_t kmax; /* PTB add - max kernel threads */ + char *blockmap; /* PTB add - map of block states */ + unsigned long disabled; /* PTB add - when was it disabled */ + int req_timeo; /* PTB add - inactivity timeout */ + struct timer_list run_queue; /* PTB add 
- run queue */ + struct work_struct task_queue; /* PTB add - task queue */ + char devnam[4]; /* PTB add - drive letters */ + atomic_t maxreqblks; /* PTB add - maximum req size seen */ + int max_sectors; /* PTB add - max req size allowed! */ + int lives; /* PTB add - # times enabled */ + // PTB speed measurement settings + struct enbd_speed tspeed; + struct enbd_speed wspeed; + struct enbd_speed rspeed; + int dummy; /* PTB add - unused */ + struct request *req; /* PTB fake request for ioctls */ + wait_queue_head_t req_wq; /* PTB req done notifications */ + struct request *rq; /* PTB special request ptr */ + atomic_t seqno_in; /* PTB add - unacked reqs */ + struct semaphore pid_sem; /* PTB control setting pid */ + struct gendisk *disk; /* PTB for partitions */ + struct request_queue *q; /* PTB make queue internal */ + rwlock_t meta_lock; /* PTB add - spinlock meta data */ + atomic_t merge_requests; /* PTB local req blks limit - 1 */ + atomic_t md_count; /* PTB count of raids we are in */ + unsigned long reenable_time; /* PTB time to delayed reenable */ + unsigned long last_checked; /* PTB media_check last called */ + int md5_off_threshold; /* PTB autolimits on md5 */ + int md5_on_threshold; /* PTB autolimits on md5 */ + spinlock_t lock; /* PTB kernel req queue lock */ + struct ctl_table systable[ENBD_SYSCTL_MAX]; + + void (*enable) (struct enbd_device *lo); + void (*reset) (struct enbd_device *lo); + int (*disable) (struct enbd_device *lo); + int (*read_only) (struct enbd_device *lo); + void (*set_speed) (struct enbd_device *lo); + int (*hard_reset)(struct enbd_device *lo); + int (*soft_reset)(struct enbd_device *lo); + int (*reenable_delay) (struct enbd_device *lo, int delay); + // PTB we steal these from the queue struct at init + merge_requests_fn *ll_merge_requests_fn; + merge_request_fn *ll_front_merge_fn; + merge_request_fn *ll_back_merge_fn; + + }; + +#endif /* MAJOR_NR */ + + + +/* Pavel - This now IS in some kind of include file... 
*/ + +/* PTB 132 */ +#define ENBD_INIT_MAGIC 0x12345678 /* AMARIN */ +#define ENBD_REQUEST_MAGIC 0x25609513 +#define ENBD_REPLY_MAGIC 0x67446698 +/* Pavel - Do *not* use magics: 0x12560953 0x96744668. + */ + +#define ENBD_DEV_MAGIC 0x68797548 + +#define ENBD_REQUEST_MAGIC_T __u32 +#define ENBD_REQUEST_TYPE_T __u32 +#define ENBD_REQUEST_FROM_T __u64 +#define ENBD_REQUEST_LEN_T __u32 +#define ENBD_REQUEST_FLAGS_T __u32 +#define ENBD_REQUEST_TIME_T __u64 +#define ENBD_REQUEST_ZONE_T __u64 +#define ENBD_REQUEST_SPECIAL_T __u32 + +#define ENBD_REPLY_MAGIC_T __u32 +#define ENBD_REPLY_ERROR_T __s32 +#define ENBD_REPLY_FLAGS_T __u32 +#define ENBD_REPLY_TIME_T __u64 +#define ENBD_REPLY_ZONE_T __s64 + +#define ENBD_REQUEST_HANDLE_T __u32 +#define ENBD_REPLY_HANDLE_T __u32 + + typedef __u32 enbd_digest_t[4]; + +#define ENBD_DIGEST_T enbd_digest_t + +#define ENBD_REQUEST_DIGEST_T enbd_digest_t +#define ENBD_REPLY_DIGEST_T enbd_digest_t + +#define ENBD_DIGEST_BITS 128 +#define ENBD_DIGEST_LENGTH ((ENBD_DIGEST_BITS)/8) +#define ENBD_REQUEST_SEQNO_T __u32 + +struct enbd_request { + + ENBD_REQUEST_MAGIC_T magic; + ENBD_REQUEST_TYPE_T type; /* == READ || == WRITE */ + ENBD_REQUEST_HANDLE_T handle; + ENBD_REQUEST_FROM_T from; /* 64 bit PTB 132 */ + ENBD_REQUEST_LEN_T len; + +#define ENBD_REQUEST_ERRORED 0x0800 +#define ENBD_REQUEST_MD5SUM 0x8000 /* has a digest in it ..*/ +#define ENBD_REQUEST_MD5_OK 0x10000 /* .. 
and equaled req's */ +#define ENBD_REQUEST_IOCTL 0x40000 /* ioctl in len, arg in from */ +#define ENBD_REQUEST_SPECIALRW 0x80000 /* 1 for w 0 for r on special */ + + ENBD_REQUEST_FLAGS_T flags; + ENBD_REQUEST_TIME_T time; + ENBD_REQUEST_ZONE_T zone; + ENBD_REQUEST_SEQNO_T seqno; + union { + ENBD_REQUEST_DIGEST_T digest; + struct { + int (*send_data_from_req) (struct enbd_request * request, + struct request * req, int size); + } method; + } data; + ENBD_REQUEST_SPECIAL_T special; + char dummy0[0]; + char dummy1[0] __attribute__ ((aligned (64))); +} __attribute__ ((packed)) ; + + #define ENBD_REQUEST_LENGTH sizeof(struct enbd_request) + +struct enbd_reply { + ENBD_REPLY_MAGIC_T magic; + ENBD_REPLY_ERROR_T error; /* 0 = ok, else error */ + ENBD_REPLY_HANDLE_T handle; /* handle you got from request */ + +#define ENBD_REPLY_ERRORED 0x0800 +#define ENBD_REPLY_MD5SUM 0x8000 /* has a digest in it .. */ +#define ENBD_REPLY_MD5_OK 0x10000 /* .. and equaled req's */ +#define ENBD_REPLY_CLOSE 0x20000 /* close cmd from server */ +#define ENBD_REPLY_IOCTL 0x40000 /* ioctl in len, arg in from */ + + ENBD_REPLY_FLAGS_T flags; + ENBD_REPLY_TIME_T time; + ENBD_REPLY_ZONE_T zone; + union { + ENBD_REPLY_DIGEST_T digest; + struct { + int (*recv_data_to_req) (struct enbd_reply * reply, + struct request *req, int size); + } method; + } data; + char dummy0[0]; + char dummy1[0] __attribute__ ((aligned (64))); +} __attribute__ ((packed)) ; + + #define ENBD_REPLY_LENGTH sizeof(struct enbd_reply) + + #define ENBD_BUFFER_DATA_OFFSET \ + ((ENBD_REQUEST_LENGTH>ENBD_REPLY_LENGTH)?ENBD_REQUEST_LENGTH:ENBD_REPLY_LENGTH) + + #ifdef MAJOR_NR + + // PTB forward declaration + static struct enbd_device enbd_dev[]; + + static void local_nbd_end_request(struct request *req, int uptodate) { + + struct bio *bio; + static int rq_type(struct request *); + int type = rq_type(req); + + if (type == IOCTL) { + complete(req->waiting); + // PTB let the driver code put_ the req, so that it can recover info + return; 
+ } + + /* unlock chained buffers */ + while ((bio = req->bio) != NULL) { + unsigned nsect = bio_sectors(bio); + blk_finished_io(nsect); + req->bio = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio, nsect << 9, uptodate ? 0 : -EIO); + } + + } + + /* + * PTB This takes the spinlock itself! So call it with the io spinlock + * not held. + */ + static void local_nbd_end_request_lock(struct request *req, int uptodate) { + + unsigned long flags; + request_queue_t *q = req->q; + + spin_lock_irqsave(q->queue_lock, flags); + local_nbd_end_request(req, uptodate); + spin_unlock_irqrestore(q->queue_lock, flags); + } + + /* + * PTB Call this only with the io spinlock * held. + */ + static inline void enbd_end_request(struct request *req) { + + // PTB the kernel has only 2 queues, read and write, and it uses + // the cmd field to determine to which the req belongs. We add a + // seqno to it in enbd_do_req, so we reestablish it here. + static void rq_set_seqno(struct request *, int); + + rq_set_seqno(req, 0); // PTB Zero extra seqno info + local_nbd_end_request( req, (req->errors == 0) ? 1 : 0 ); + } + + /* + * PTB This takes the spinlock itself! So call it with the io spinlock + * not held. + */ + static void enbd_end_request_lock(struct request *req) { + + // PTB the kernel has only 2 queues, read and write, and it uses + // the cmd field to determine to which the req belongs. We add a + // seqno to it in enbd_do_req, so we reestablish it here. 
+ static void rq_set_seqno(struct request *, int); + + rq_set_seqno(req, 0); // PTB Zero extra seqno info + local_nbd_end_request_lock( req, !req->errors ); + } + + extern int enbd_init_seqno(struct enbd_seqno *); + extern int enbd_init_speed(struct enbd_speed *); + extern int enbd_init_md(struct enbd_md *); + extern void enbd_init_proc(struct proc_dir_entry *res); + + #endif /* MAJOR_NR */ + +#endif /* LINUX_ENBD_H */ + + --- linux-2.6.7/include/linux/enbd_ioctl.h.pre-enbd +++ linux-2.6.7/include/linux/enbd_ioctl.h Fri Apr 9 23:50:02 2004 @@ -0,0 +1,57 @@ +#ifndef ENBD_IOCTL_H +#define ENBD_IOCTL_H 1 + +//int enbd_ioctl_convert(int ioctl); +//int enbd_ioctl_convert_inplace(int *ioctl); +//int enbd_ioctl_revert(int ioctl); +//int enbd_ioctl_size (int cmd, char *arg); +//int enbd_ioctl_size_user (int cmd, char *arg); + +#ifdef __KERNEL__ +//static int enbd_ioctl_copy_to_user (int cmd, char *arg, char *buf, int size); +//static int enbd_ioctl_copy_from_user (int cmd, char *buf, char *arg, int size); + +/* + * PTB object containing all the above methods, to be registered with + * the enbd.o module + */ +struct enbd_ioctl { +#define ENBD_REMOTE_IOCTL_ENABLED 0x01 + unsigned long flags; + int (*convert) (int ioctl); + int (*convert_inplace)(int *ioctl); + int (*revert) (int ioctl); + int (*size) (int cmd, char *arg); + int (*size_user) (int cmd, char *arg); + int (*cp_to_user) (int cmd, char *arg, char *buf, int size); + int (*cp_from_user) (int cmd, char *buf, char *arg, int size); +}; + +struct enbd_ioctl_stub { + struct enbd_ioctl * remote; + int (*reg) (struct enbd_ioctl_stub *,struct enbd_ioctl *); + int (*unreg) (struct enbd_ioctl_stub *,struct enbd_ioctl *); +}; + +extern struct enbd_ioctl_stub enbd_remote_ioctl; +extern int enbd_init_ioctl_stub(struct enbd_ioctl_stub *); +#endif + +// PTB conversion table entries +struct ioctl_conv { + unsigned int old; // ioctl id, _IO or _IOR or _IOW or _IOWR + unsigned int new; // ioctl id +}; + +// PTB extended conversion 
table entries +struct ioctl_special { + int new; + int (*size) (char *arg); + int (*size_user) (char *arg); + int (*ioctl_copy_from_user)(char *buf, char*arg, int size); + int (*ioctl_copy_to_user)(char *arg, char*buf, int size); +}; + +extern int enbd_init_ioctl_stub(struct enbd_ioctl_stub *); + +#endif /* ENBD_IOCTL_H */ --- linux-2.6.7/include/linux/enbd_ioctl_table.h.pre-enbd Fri Apr 9 23:39:07 2004 +++ linux-2.6.7/include/linux/enbd_ioctl_table.h Sun Aug 8 12:33:14 2004 @@ -0,0 +1,201 @@ +#ifndef ENBD_IOCTL_TABLE_H +#define ENBD_IOCTL_TABLE_H 1 + +/* + * PTB the space before the final comma is important as the ## + * discards the preceding token when D is empty + */ +#define _NEW_IO_(B,C,D...) C(_IOC_TYPE(B), _IOC_NR(B) , ## D) +#define _NEW_IO(B,D...) _IO(_IOC_TYPE(B), _IOC_NR(B) , ## D) +#define _NEW_IOW(B,D...) _IOW(_IOC_TYPE(B), _IOC_NR(B) , ## D) +#define _NEW_IOR(B,D...) _IOR(_IOC_TYPE(B), _IOC_NR(B) , ## D) +#define _NEW_IOWR(B,D...) _IOWR(_IOC_TYPE(B), _IOC_NR(B) , ## D) +#define _NEW_IORS(B) _IOC(_IOC_READ,_IOC_TYPE(B), _IOC_NR(B), _IOC_SIZEMASK) +#define _NEW_IOWRS(B) _IOC(_IOC_READ|_IOC_WRITE,_IOC_TYPE(B), _IOC_NR(B), _IOC_SIZEMASK) + +/* + * This is the whitelist of remote ioctls - an entry here tells the + * driver that it's OK to send this ioctl out over the net, because we + * have the right info on it. + * + * "The right info" is what is on the right hand side of the table (a 0 + * stands for a repetition of the LHS info). We have to fixup something + * that a lot of kernel authors forgot to do or got wrong - namely + * declare their ioctls in a way that conveys information about their + * intended mode of use (see ioctl.h in the kernel sources). + * + * We need all ioctls to be declared as either + * + * _IO(class,id) -- default. Means no args. The call is enough. 
+ * _IOW(class,id,type) -- we write a value to kernel that is sizeof(type) + * _IOR(class,id,type) -- we read a value from kernel sizeof(type) + * _IOWR(class,id,type) -- ibid, but both ways + * + * The "R" bit is crucial because it lets us know that the data is + * _indirected_. I.e. it's an address of somewhere in userspace where + * we want to read data from or write data to. + * + * The "type" part should be the type of the indirected argument, NOT + * the type of its address! + * + * Kernel authors typically make two mistakes: + * + * 1) they write _IO instead of _IOR or _IOWR, and hence forget the + * type info. Well, not telling me if the argument data is + * directly or indirectly accessible was already bad enough! + * 2) they get the type argument _wrong_ when they do remember to + * put it. They write "int *" instead of "int", for example, + * when the argument to the ioctl is a pointer to an integer. + * OK, so it's a natural mistake to make! But in that case the + * argument should be "int" so that the kernel macro picks up + * sizeof(int) instead of sizeof(int*). + * + * Those "errors" have to be repaired via this table. Wrong at left, + * corrected at right. A 0 for the new entry indicates that the old + * was alright. If there isn't an entry, the ioctl won't be treated. + * If the size info works out at the max for the field (2^14 - 1) + * then an extra table is consulted for size and copy methods. 
+ */ +static struct ioctl_conv +ioctl_conv_tab[] = { +// fs.h + { BLKROSET, _NEW_IOW(BLKROSET,int), }, + { BLKROGET, _NEW_IOR(BLKROGET,int), }, + { FIBMAP, _NEW_IOR(FIBMAP,int), }, +//#define BLKRRPART _IO(0x12,95) /* re-read partition table */ + { BLKRRPART, 0, }, + { BLKGETSIZE, _NEW_IOR(BLKGETSIZE,int), }, +//#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ + { BLKFLSBUF, 0, }, + { BLKRASET, _NEW_IOW(BLKRASET,int), }, + { BLKRAGET, _NEW_IOR(BLKRAGET,int), }, + { BLKFRASET, _NEW_IOW(BLKFRASET,int), }, + { BLKFRAGET, _NEW_IOR(BLKFRAGET,int), }, + { BLKSECTSET, _NEW_IOW(BLKSECTSET,int), }, + { BLKSECTGET, _NEW_IOR(BLKSECTGET,int), }, + { BLKSSZGET, _NEW_IOR(BLKSSZGET,int), }, +// fd.h + { FDCLRPRM, 0, }, + { FDSETPRM, _NEW_IOWR(FDSETPRM, struct floppy_struct), }, + { FDDEFPRM, _NEW_IOWR(FDDEFPRM, struct floppy_struct), }, + { FDGETPRM, _NEW_IOR(FDGETPRM, struct floppy_struct), }, + { FDMSGON, 0, }, + { FDMSGOFF, 0, }, + { FDFMTBEG, 0, }, + { FDFMTTRK, _NEW_IOWR(FDFMTTRK, struct format_descr), }, + { FDFMTEND, 0, }, + { FDSETEMSGTRESH, _NEW_IOW(FDSETEMSGTRESH, unsigned), }, + { FDFLUSH, 0, }, + { FDSETMAXERRS, _NEW_IOWR(FDSETMAXERRS, struct floppy_max_errors), }, + { FDGETMAXERRS, _NEW_IOR(FDGETMAXERRS, struct floppy_max_errors), }, + { FDGETDRVTYP, _NEW_IOR(FDGETDRVTYP, floppy_drive_name), }, // 16 bytes + { FDSETDRVPRM, _NEW_IOWR(FDSETDRVPRM, struct floppy_drive_params), }, + { FDGETDRVPRM, _NEW_IOR(FDGETDRVPRM, struct floppy_drive_params), }, + { FDGETDRVSTAT, _NEW_IOR(FDGETDRVSTAT, struct floppy_drive_struct), }, + { FDPOLLDRVSTAT, _NEW_IOR(FDPOLLDRVSTAT, struct floppy_drive_struct), }, + { FDRESET, 0, }, + { FDGETFDCSTAT, _NEW_IOR(FDGETFDCSTAT, struct floppy_fdc_state), }, + { FDWERRORCLR, 0, }, + { FDWERRORGET, _NEW_IOR(FDWERRORGET, struct floppy_write_errors), }, + { FDRAWCMD, _NEW_IOWR(FDRAWCMD, struct floppy_raw_cmd[1]) }, // FIXME linked list + { FDTWADDLE, 0, }, + { FDEJECT, 0, }, +// cdrom.h + { CDROMPAUSE, _NEW_IO(CDROMPAUSE), }, + { CDROMRESUME, 
_NEW_IO(CDROMRESUME), }, + { CDROMPLAYMSF, _NEW_IOR(CDROMPLAYMSF, struct cdrom_msf), }, + { CDROMPLAYTRKIND, _NEW_IOR(CDROMPLAYTRKIND, struct cdrom_ti), }, + { CDROMREADTOCHDR, _NEW_IOWR(CDROMREADTOCHDR, struct cdrom_tochdr), }, + { CDROMREADTOCENTRY, _NEW_IOWR(CDROMREADTOCENTRY, struct cdrom_tocentry), }, + { CDROMSTOP, _NEW_IO(CDROMSTOP), }, + { CDROMSTART, _NEW_IO(CDROMSTART), }, + { CDROMEJECT, _NEW_IO(CDROMEJECT), }, + { CDROMVOLCTRL, _NEW_IOR(CDROMVOLCTRL, struct cdrom_volctrl), }, + { CDROMSUBCHNL, _NEW_IOWR(CDROMSUBCHNL, struct cdrom_subchnl), }, + { CDROMREADMODE2, _NEW_IOR(CDROMREADMODE2, struct cdrom_read), }, // INDIRECT 2336B + { CDROMREADMODE1, _NEW_IOR(CDROMREADMODE1, struct cdrom_read), }, // INDIRECT 2048B + { CDROMREADAUDIO, _NEW_IOR(CDROMREADAUDIO, struct cdrom_read_audio), }, + { CDROMEJECT_SW, _NEW_IO(CDROMEJECT_SW), }, + { CDROMMULTISESSION, _NEW_IOWR(CDROMMULTISESSION, struct cdrom_multisession), }, + { CDROM_GET_MCN, _NEW_IOWR(CDROM_GET_MCN, struct cdrom_mcn), }, + { CDROMRESET, _NEW_IO(CDROMRESET), }, + { CDROMVOLREAD, _NEW_IOWR(CDROMVOLREAD, struct cdrom_volctrl), }, + { CDROMREADRAW, _NEW_IOR(CDROMREADRAW, struct cdrom_read), }, // INDIRECT 2352B + // aztcd.c optcd.c + { CDROMREADCOOKED, _NEW_IOR(CDROMREADCOOKED, struct cdrom_msf), }, // INDIRECT FIXME + { CDROMSEEK, _NEW_IOR(CDROMSEEK, struct cdrom_msf), }, + // scsi-cd.c + { CDROMPLAYBLK, _NEW_IOWR(CDROMPLAYBLK, struct cdrom_blk), }, + // optcd.c + { CDROMREADALL, _NEW_IOR(CDROMREADALL, char[2646]), }, + // ide-cd.c +#ifdef CDROMGETSPINDOWN + { CDROMGETSPINDOWN, _NEW_IOWR(CDROMGETSPINDOWN, char), }, // one byte +#endif +#ifdef CDROMSETSPINDOWN + { CDROMSETSPINDOWN, _NEW_IOWR(CDROMSETSPINDOWN, char), }, // one byte +#endif + // cdrom.c + { CDROMCLOSETRAY, _NEW_IO(CDROMCLOSETRAY), }, + { CDROM_SET_OPTIONS, _NEW_IOW(CDROM_SET_OPTIONS, int), }, + { CDROM_CLEAR_OPTIONS, _NEW_IOW(CDROM_CLEAR_OPTIONS, int), }, + { CDROM_SELECT_SPEED, _NEW_IOW(CDROM_SELECT_SPEED, int), }, // FIXME - don't know 
+ { CDROM_SELECT_DISC, _NEW_IOW(CDROM_SELECT_DISC, int), }, + { CDROM_MEDIA_CHANGED, _NEW_IOW(CDROM_MEDIA_CHANGED, int), }, + { CDROM_DRIVE_STATUS, _NEW_IOW(CDROM_DRIVE_STATUS, int), }, + { CDROM_DISC_STATUS, _NEW_IO(CDROM_DISC_STATUS), }, + { CDROM_CHANGER_NSLOTS, _NEW_IO(CDROM_CHANGER_NSLOTS), }, + { CDROM_LOCKDOOR, _NEW_IOW(CDROM_LOCKDOOR, int), }, + { CDROM_DEBUG, _NEW_IOW(CDROM_DEBUG, int), }, + { CDROM_GET_CAPABILITY, _NEW_IO(CDROM_GET_CAPABILITY), }, + // sbpcd + { CDROMAUDIOBUFSIZ, _NEW_IOW(CDROMAUDIOBUFSIZ, int), }, + // dvd +#ifdef DVD_READ_STRUCT + // PTB SHartley@encentrus.com says it's IOWR, not IOR. + { DVD_READ_STRUCT, _NEW_IOWR(DVD_READ_STRUCT, dvd_struct), }, +#endif +#ifdef DVD_WRITE_STRUCT + { DVD_WRITE_STRUCT, _NEW_IOWR(DVD_WRITE_STRUCT, dvd_struct), }, +#endif +#ifdef DVD_AUTH + { DVD_AUTH, _NEW_IOWR(DVD_AUTH, dvd_authinfo), }, +#endif +#ifdef CDROM_SEND_PACKET + { CDROM_SEND_PACKET, _NEW_IOR(CDROM_SEND_PACKET, struct cdrom_generic_command), }, +#endif +#ifdef CDROM_NEXT_WRITABLE + { CDROM_NEXT_WRITABLE, _NEW_IOWR(CDROM_NEXT_WRITABLE, long), }, +#endif +#ifdef CDROM_LAST_WRITTEN + { CDROM_LAST_WRITTEN, _NEW_IOWR(CDROM_LAST_WRITTEN, long), }, +#endif + // PTB local test ioctls + { ENBD_TEST_IOCTL1, 0, }, // write an int + { ENBD_TEST_IOCTL2, 0, }, // read an int + { ENBD_TEST_IOCTL3, 0, }, // write and read an int + { ENBD_TEST_IOCTL4, 0, }, // read 256B + { ENBD_TEST_IOCTL5, 0, }, // r/w 256B + { ENBD_TEST_IOCTL6, _NEW_IORS(ENBD_TEST_IOCTL6), }, // read special + { ENBD_TEST_IOCTL7, _NEW_IORS(ENBD_TEST_IOCTL7), }, // r/w special + // PTB enbd special ioctls + { ENBD_REMOTE_CHECK, 0, }, // write an int + { ENBD_REMOTE_REVALIDATE, 0, }, + // PTB we must terminate with a 0,0 entry. + {0 , 0, }, +}; + +/* + * This should be the table of special methods for certain ioctls. + * The "new" code is the real index. It will have a size count of + * _IOC_SIZEMASK but the rest of it should be meaningful. 
The size is + * gotten by dynamic lookup using the size() function. + */ +static struct ioctl_special ioctl_special_tab[] = { + // PTB last entry must be all zeros + { 0, NULL, NULL, NULL, NULL, }, +}; + + + +#endif /* ENBD_IOCTL_TABLE_H */ --- linux-2.6/drivers/block/Kconfig.orig 2005-07-14 22:39:24.692296236 +0200 +++ linux-2.6/drivers/block/Kconfig 2005-07-18 01:58:58.876380093 +0200 @@ -362,6 +362,15 @@ config BLK_DEV_UB If unsure, say N. +config ENBD + bool 'Enhanced network block device' + depends on NET + ---help--- + To use the ENBD support, you must say Y here and select one + of the driver's units (e.g. BLK_DEV_ENBD, BLK_DEV_ENBD_IOCTL). + +source "drivers/block/enbd/Kconfig" + config BLK_DEV_RAM tristate "RAM disk support" ---help---