From: Andy P. <at...@us...> - 2002-04-09 12:44:29
Update of /cvsroot/linux-vax/kernel-2.4/include/linux/raid
In directory usw-pr-cvs1:/tmp/cvs-serv17906/linux/raid

Modified Files:
	md.h md_compatible.h md_k.h md_u.h raid1.h raid5.h
Added Files:
	multipath.h
Log Message:
sync 2.4.15 commit 3

--- NEW FILE ---
#ifndef _MULTIPATH_H
#define _MULTIPATH_H

#include <linux/raid/md.h>

struct multipath_info {
	int		number;
	int		raid_disk;
	kdev_t		dev;

	/*
	 * State bits:
	 */
	int		operational;
	int		spare;

	int		used_slot;
};

struct multipath_private_data {
	mddev_t			*mddev;
	struct multipath_info	multipaths[MD_SB_DISKS];
	int			nr_disks;
	int			raid_disks;
	int			working_disks;
	mdk_thread_t		*thread;
	struct multipath_info	*spare;
	md_spinlock_t		device_lock;

	/* buffer pool */
	/* buffer_heads that we have pre-allocated have b_pprev -> &freebh
	 * and are linked into a stack using b_next
	 * multipath_bh that are pre-allocated have MPBH_PreAlloc set.
	 * All these variable are protected by device_lock
	 */
	struct multipath_bh	*freer1;
	int			freer1_blocked;
	int			freer1_cnt;
	md_wait_queue_head_t	wait_buffer;
};

typedef struct multipath_private_data multipath_conf_t;

/*
 * this is the only point in the RAID code where we violate
 * C type safety. mddev->private is an 'opaque' pointer.
 */
#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)

/*
 * this is our 'private' 'collective' MULTIPATH buffer head.
 * it contains information about what kind of IO operations were started
 * for this MULTIPATH operation, and about their status:
 */

struct multipath_bh {
	atomic_t		remaining; /* 'have we finished' count,
					    * used from IRQ handlers */
	int			cmd;
	unsigned long		state;
	mddev_t			*mddev;
	struct buffer_head	*master_bh;
	struct buffer_head	bh_req;
	struct multipath_bh	*next_mp; /* next for retry or in free list */
};

/* bits for multipath_bh.state */
#define	MPBH_Uptodate	1
#define	MPBH_SyncPhase	2
#define	MPBH_PreAlloc	3	/* this was pre-allocated, add to free list */
#endif

Index: md.h
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/include/linux/raid/md.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- md.h	14 Jan 2001 16:48:35 -0000	1.1.1.1
+++ md.h	9 Apr 2002 12:44:17 -0000	1.2
@@ -36,6 +36,7 @@
 #include <linux/locks.h>
 #include <linux/kernel_stat.h>
 #include <asm/io.h>
+#include <linux/completion.h>
 
 #include <linux/raid/md_compatible.h>
 /*
@@ -77,10 +78,9 @@
 extern void md_sync_acct(kdev_t dev, unsigned long nr_sectors);
 extern void md_recover_arrays (void);
 extern int md_check_ordering (mddev_t *mddev);
-extern struct gendisk * find_gendisk (kdev_t dev);
 extern int md_notify_reboot(struct notifier_block *this,
 					unsigned long code, void *x);
-extern int md_error (kdev_t mddev, kdev_t rdev);
+extern int md_error (mddev_t *mddev, kdev_t rdev);
 extern int md_run_setup(void);
 
 extern void md_print_devices (void);

Index: md_compatible.h
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/include/linux/raid/md_compatible.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- md_compatible.h	14 Jan 2001 16:48:38 -0000	1.1.1.1
+++ md_compatible.h	9 Apr 2002 12:44:17 -0000	1.2
@@ -27,12 +27,14 @@
 /* 000 */
 #define md__get_free_pages(x,y) __get_free_pages(x,y)
 
-#ifdef __i386__
+#if defined(__i386__) || defined(__x86_64__)
 /* 001 */
-extern __inline__ int md_cpu_has_mmx(void)
+static __inline__ int md_cpu_has_mmx(void)
 {
 	return test_bit(X86_FEATURE_MMX, &boot_cpu_data.x86_capability);
 }
+#else
+#define md_cpu_has_mmx(x)	(0)
 #endif
 
 /* 002 */
@@ -51,7 +53,7 @@
 #define md_put_user put_user
 
 /* 007 */
-extern inline int md_capable_admin(void)
+static inline int md_capable_admin(void)
 {
 	return capable(CAP_SYS_ADMIN);
 }
@@ -60,7 +62,7 @@
 #define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode)
 
 /* 009 */
-extern inline void md_flush_signals (void)
+static inline void md_flush_signals (void)
 {
 	spin_lock(&current->sigmask_lock);
 	flush_signals(current);
@@ -68,7 +70,7 @@
 }
 
 /* 010 */
-extern inline void md_init_signals (void)
+static inline void md_init_signals (void)
 {
 	current->exit_signal = SIGCHLD;
 	siginitsetinv(&current->blocked, sigmask(SIGKILL));

Index: md_k.h
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/include/linux/raid/md_k.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- md_k.h	14 Jan 2001 16:48:36 -0000	1.1.1.1
+++ md_k.h	9 Apr 2002 12:44:17 -0000	1.2
@@ -17,17 +17,18 @@
 
 #define MD_RESERVED       0UL
 #define LINEAR            1UL
-#define STRIPED           2UL
-#define RAID0             STRIPED
+#define RAID0             2UL
 #define RAID1             3UL
 #define RAID5             4UL
 #define TRANSLUCENT       5UL
 #define HSM               6UL
-#define MAX_PERSONALITY   7UL
+#define MULTIPATH         7UL
+#define MAX_PERSONALITY   8UL
 
-extern inline int pers_to_level (int pers)
+static inline int pers_to_level (int pers)
 {
 	switch (pers) {
+		case MULTIPATH:		return -4;
 		case HSM:		return -3;
 		case TRANSLUCENT:	return -2;
 		case LINEAR:		return -1;
@@ -35,12 +36,14 @@
 		case RAID1:		return 1;
 		case RAID5:		return 5;
 	}
-	panic("pers_to_level()");
+	BUG();
+	return MD_RESERVED;
 }
 
-extern inline int level_to_pers (int level)
+static inline int level_to_pers (int level)
 {
 	switch (level) {
+		case -4: return MULTIPATH;
 		case -3: return HSM;
 		case -2: return TRANSLUCENT;
 		case -1: return LINEAR;
@@ -72,7 +75,7 @@
 
 extern dev_mapping_t mddev_map [MAX_MD_DEVS];
 
-extern inline mddev_t * kdev_to_mddev (kdev_t dev)
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
 {
 	if (MAJOR(dev) != MD_MAJOR)
 		BUG();
@@ -90,62 +93,62 @@
  */
 #define MD_READAHEAD	MAX_READAHEAD
 
-extern inline int disk_faulty(mdp_disk_t * d)
+static inline int disk_faulty(mdp_disk_t * d)
 {
 	return d->state & (1 << MD_DISK_FAULTY);
 }
 
-extern inline int disk_active(mdp_disk_t * d)
+static inline int disk_active(mdp_disk_t * d)
 {
 	return d->state & (1 << MD_DISK_ACTIVE);
 }
 
-extern inline int disk_sync(mdp_disk_t * d)
+static inline int disk_sync(mdp_disk_t * d)
 {
 	return d->state & (1 << MD_DISK_SYNC);
 }
 
-extern inline int disk_spare(mdp_disk_t * d)
+static inline int disk_spare(mdp_disk_t * d)
 {
 	return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
 }
 
-extern inline int disk_removed(mdp_disk_t * d)
+static inline int disk_removed(mdp_disk_t * d)
 {
 	return d->state & (1 << MD_DISK_REMOVED);
 }
 
-extern inline void mark_disk_faulty(mdp_disk_t * d)
+static inline void mark_disk_faulty(mdp_disk_t * d)
 {
 	d->state |= (1 << MD_DISK_FAULTY);
 }
 
-extern inline void mark_disk_active(mdp_disk_t * d)
+static inline void mark_disk_active(mdp_disk_t * d)
 {
 	d->state |= (1 << MD_DISK_ACTIVE);
 }
 
-extern inline void mark_disk_sync(mdp_disk_t * d)
+static inline void mark_disk_sync(mdp_disk_t * d)
 {
 	d->state |= (1 << MD_DISK_SYNC);
 }
 
-extern inline void mark_disk_spare(mdp_disk_t * d)
+static inline void mark_disk_spare(mdp_disk_t * d)
 {
 	d->state = 0;
 }
 
-extern inline void mark_disk_removed(mdp_disk_t * d)
+static inline void mark_disk_removed(mdp_disk_t * d)
 {
 	d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
 }
 
-extern inline void mark_disk_inactive(mdp_disk_t * d)
+static inline void mark_disk_inactive(mdp_disk_t * d)
 {
 	d->state &= ~(1 << MD_DISK_ACTIVE);
 }
 
-extern inline void mark_disk_nonsync(mdp_disk_t * d)
+static inline void mark_disk_nonsync(mdp_disk_t * d)
 {
 	d->state &= ~(1 << MD_DISK_SYNC);
 }
@@ -170,6 +173,7 @@
 	mdp_super_t *sb;
 	unsigned long sb_offset;
 
+	int alias_device;		/* device alias to the same disk */
 	int faulty;			/* if faulty do not issue IO requests */
 	int desc_nr;			/* descriptor index in the superblock */
 };
@@ -245,18 +249,19 @@
  * number. This will have to change to dynamic allocation
  * once we start supporting partitioning of md devices.
  */
-extern inline int mdidx (mddev_t * mddev)
+static inline int mdidx (mddev_t * mddev)
 {
 	return mddev->__minor;
 }
 
-extern inline kdev_t mddev_to_kdev(mddev_t * mddev)
+static inline kdev_t mddev_to_kdev(mddev_t * mddev)
 {
 	return MKDEV(MD_MAJOR, mdidx(mddev));
 }
 
 extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
 extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
+extern mdp_disk_t *get_spare(mddev_t *mddev);
 
 /*
  * iterates through some rdev ringlist. It's safe to remove the
@@ -304,12 +309,12 @@
 			tmp = tmp->next, tmp->prev != &all_mddevs	\
 		; )
 
-extern inline int lock_mddev (mddev_t * mddev)
+static inline int lock_mddev (mddev_t * mddev)
 {
 	return down_interruptible(&mddev->reconfig_sem);
 }
 
-extern inline void unlock_mddev (mddev_t * mddev)
+static inline void unlock_mddev (mddev_t * mddev)
 {
 	up(&mddev->reconfig_sem);
 }
@@ -322,7 +327,7 @@
 	void			*data;
 	md_wait_queue_head_t	wqueue;
 	unsigned long		flags;
-	struct semaphore	*sem;
+	struct completion	*event;
 	struct task_struct	*tsk;
 	const char		*name;
 } mdk_thread_t;
@@ -363,6 +368,31 @@
 	if (condition)							\
 		break;							\
 	__wait_event_lock_irq(wq, condition, lock);			\
+} while (0)
+
+
+#define __wait_disk_event(wq, condition)				\
+do {									\
+	wait_queue_t __wait;						\
+	init_waitqueue_entry(&__wait, current);				\
+									\
+	add_wait_queue(&wq, &__wait);					\
+	for (;;) {							\
+		set_current_state(TASK_UNINTERRUPTIBLE);		\
+		if (condition)						\
+			break;						\
+		run_task_queue(&tq_disk);				\
+		schedule();						\
+	}								\
+	current->state = TASK_RUNNING;					\
+	remove_wait_queue(&wq, &__wait);				\
+} while (0)
+
+#define wait_disk_event(wq, condition)					\
+do {									\
+	if (condition)							\
+		break;							\
+	__wait_disk_event(wq, condition);				\
 } while (0)
 
 #endif

Index: md_u.h
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/include/linux/raid/md_u.h,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -u -r1.1.1.2 -r1.2
--- md_u.h	25 Feb 2001 23:14:50 -0000	1.1.1.2
+++ md_u.h	9 Apr 2002 12:44:17 -0000	1.2
@@ -35,6 +35,7 @@
 #define PROTECT_ARRAY		_IO (MD_MAJOR, 0x27)
 #define HOT_ADD_DISK		_IO (MD_MAJOR, 0x28)
 #define SET_DISK_FAULTY		_IO (MD_MAJOR, 0x29)
+#define HOT_GENERATE_ERROR	_IO (MD_MAJOR, 0x2a)
 
 /* usage */
 #define RUN_ARRAY		_IOW (MD_MAJOR, 0x30, mdu_param_t)

Index: raid1.h
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/include/linux/raid/raid1.h,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -u -r1.1.1.1 -r1.2
--- raid1.h	14 Jan 2001 16:48:39 -0000	1.1.1.1
+++ raid1.h	9 Apr 2002 12:44:17 -0000	1.2
@@ -42,7 +42,10 @@
 	 */
 	struct buffer_head	*freebh;
 	int			freebh_cnt;	/* how many are on the list */
+	int			freebh_blocked;
 	struct raid1_bh		*freer1;
+	int			freer1_blocked;
+	int			freer1_cnt;
 	struct raid1_bh		*freebuf;	/* each bh_req has a page allocated */
 	md_wait_queue_head_t	wait_buffer;
 

Index: raid5.h
===================================================================
RCS file: /cvsroot/linux-vax/kernel-2.4/include/linux/raid/raid5.h,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -u -r1.1.1.2 -r1.2
--- raid5.h	25 Feb 2001 23:14:50 -0000	1.1.1.2
+++ raid5.h	9 Apr 2002 12:44:17 -0000	1.2
@@ -132,6 +132,7 @@
 	struct buffer_head	*bh_read[MD_SB_DISKS];	/* read request buffers of the MD device */
 	struct buffer_head	*bh_write[MD_SB_DISKS];	/* write request buffers of the MD device */
 	struct buffer_head	*bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
+	struct page		*bh_page[MD_SB_DISKS];	/* saved bh_cache[n]->b_page when reading around the cache */
 	unsigned long		sector;			/* sector of this row */
 	int			size;			/* buffers size */
 	int			pd_idx;			/* parity disk index */
@@ -157,6 +158,32 @@
 #define	STRIPE_HANDLE		2
 #define	STRIPE_SYNCING		3
 #define	STRIPE_INSYNC		4
+#define	STRIPE_PREREAD_ACTIVE	5
+#define	STRIPE_DELAYED		6
+
+/*
+ * Plugging:
+ *
+ * To improve write throughput, we need to delay the handling of some
+ * stripes until there has been a chance that several write requests
+ * for the one stripe have all been collected.
+ * In particular, any write request that would require pre-reading
+ * is put on a "delayed" queue until there are no stripes currently
+ * in a pre-read phase.  Further, if the "delayed" queue is empty when
+ * a stripe is put on it then we "plug" the queue and do not process it
+ * until an unplg call is made. (the tq_disk list is run).
+ *
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
+ * it to the count of prereading stripes.
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
+ * clear the PREREAD_ACTIVE flag and decrement the count
+ * Whenever the delayed queue is empty and the device is not plugged, we
+ * move any strips from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
+ * HANDLE gets cleared if stripe_handle leave nothing locked.
+ */
+
 
 struct disk_info {
 	kdev_t	dev;
@@ -181,14 +208,21 @@
 	int			max_nr_stripes;
 
 	struct list_head	handle_list; /* stripes needing handling */
+	struct list_head	delayed_list; /* stripes that have plugged requests */
+	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	/*
 	 * Free stripes pool
 	 */
 	atomic_t		active_stripes;
 	struct list_head	inactive_list;
 	md_wait_queue_head_t	wait_for_stripe;
-
+	int			inactive_blocked;	/* release of inactive stripes blocked,
+							 * waiting for 25% to be free
+							 */
 	md_spinlock_t		device_lock;
+
+	int			plugged;
+	struct tq_struct	plug_tq;
 };
 
 typedef struct raid5_private_data raid5_conf_t;
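For readers following along: a minimal, hypothetical sketch (not part of the commit above) of how a personality reaches its private data through the mddev_to_conf() macro from the new multipath.h, and how the wait_disk_event() macro added to md_k.h can wait for a pre-allocated multipath_bh. The function name example_multipath_status and the printk text are illustrative only, and the locking a real caller would need around the free list is omitted.

/* Hypothetical illustration only, built on the definitions in this commit. */
static void example_multipath_status(mddev_t *mddev)
{
	/* mddev->private is an opaque pointer; mddev_to_conf() casts it back
	 * to the personality's multipath_conf_t. */
	multipath_conf_t *conf = mddev_to_conf(mddev);

	printk(KERN_INFO "multipath: %d of %d paths operational\n",
	       conf->working_disks, conf->raid_disks);

	/* Block until a pre-allocated multipath_bh appears on the free list,
	 * running tq_disk while waiting (this is what wait_disk_event() does).
	 * Real code would take conf->device_lock before touching freer1. */
	wait_disk_event(conf->wait_buffer, conf->freer1 != NULL);
}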