WebSVN - shark - Blame - Rev 423 - /shark/trunk/drivers/linuxc26/include/linux/raid/raid5.h

Rev	Author	Line No.	Line
423	giacomo	1	#ifndef _RAID5_H
		2	#define _RAID5_H
		3
		4	#include <linux/raid/md.h>
		5	#include <linux/raid/xor.h>
		6
		7	/*
		8	*
		9	* Each stripe contains one buffer per disc. Each buffer can be in
		10	* one of a number of states stored in "flags". Changes between
		11	* these states happen almost exclusively under a per-stripe
		12	* spinlock. Some very specific changes can happen in bi_end_io, and
		13	* these are not protected by the spin lock.
		14	*
		15	* The flag bits that are used to represent these states are:
		16	* R5_UPTODATE and R5_LOCKED
		17	*
		18	* State Empty == !UPTODATE, !LOCK
		19	* We have no data, and there is no active request
		20	* State Want == !UPTODATE, LOCK
		21	* A read request is being submitted for this block
		22	* State Dirty == UPTODATE, LOCK
		23	* Some new data is in this buffer, and it is being written out
		24	* State Clean == UPTODATE, !LOCK
		25	* We have valid data which is the same as on disc
		26	*
		27	* The possible state transitions are:
		28	*
		29	* Empty -> Want - on read or write to get old data for parity calc
		30	* Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
		31	* Empty -> Clean - on compute_block when computing a block for failed drive
		32	* Want -> Empty - on failed read
		33	* Want -> Clean - on successful completion of read request
		34	* Dirty -> Clean - on successful completion of write request
		35	* Dirty -> Clean - on failed write
		36	* Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
		37	*
		38	* The Want->Empty, Want->Clean, Dirty->Clean, transitions
		39	* all happen in b_end_io at interrupt time.
		40	* Each sets the Uptodate bit before releasing the Lock bit.
		41	* This leaves one multi-stage transition:
		42	* Want->Dirty->Clean
		43	* This is safe because thinking that a Clean buffer is actually dirty
		44	* will at worst delay some action, and the stripe will be scheduled
		45	* for attention after the transition is complete.
		46	*
		47	* There is one possibility that is not covered by these states. That
		48	* is if one drive has failed and there is a spare being rebuilt. We
		49	* can't distinguish between a clean block that has been generated
		50	* from parity calculations, and a clean block that has been
		51	* successfully written to the spare ( or to parity when resyncing).
		52	* To distinguish these states we have a stripe bit STRIPE_INSYNC that
		53	* is set whenever a write is scheduled to the spare, or to the parity
		54	* disc if there is no spare. A sync request clears this bit, and
		55	* when we find it set with no buffers locked, we know the sync is
		56	* complete.
		57	*
		58	* Buffers for the md device that arrive via make_request are attached
		59	* to the appropriate stripe in one of two lists linked on b_reqnext.
		60	* One list (bh_read) for read requests, one (bh_write) for write.
		61	* There should never be more than one buffer on the two lists
		62	* together, but we are not guaranteed of that so we allow for more.
		63	*
		64	* If a buffer is on the read list when the associated cache buffer is
		65	* Uptodate, the data is copied into the read buffer and its b_end_io
		66	* routine is called. This may happen in the end_request routine only
		67	* if the buffer has just successfully been read. end_request should
		68	* remove the buffers from the list and then set the Uptodate bit on
		69	* the buffer. Other threads may do this only if they first check
		70	* that the Uptodate bit is set. Once they have checked that they may
		71	* take buffers off the read queue.
		72	*
		73	* When a buffer on the write list is committed for write it is copied
		74	* into the cache buffer, which is then marked dirty, and moved onto a
		75	* third list, the written list (bh_written). Once both the parity
		76	* block and the cached buffer are successfully written, any buffer on
		77	* a written list can be returned with b_end_io.
		78	*
		79	* The write list and read list both act as fifos. The read list is
		80	* protected by the device_lock. The write and written lists are
		81	* protected by the stripe lock. The device_lock, which can be
		82	* claimed while the stripe lock is held, is only for list
		83	* manipulations and will only be held for a very short time. It can
		84	* be claimed from interrupts.
		85	*
		86	*
		87	* Stripes in the stripe cache can be on one of two lists (or on
		88	* neither). The "inactive_list" contains stripes which are not
		89	* currently being used for any request. They can freely be reused
		90	* for another stripe. The "handle_list" contains stripes that need
		91	* to be handled in some way. Both of these are fifo queues. Each
		92	* stripe is also (potentially) linked to a hash bucket in the hash
		93	* table so that it can be found by sector number. Stripes that are
		94	* not hashed must be on the inactive_list, and will normally be at
		95	* the front. All stripes start life this way.
		96	*
		97	* The inactive_list, handle_list and hash bucket lists are all protected by the
		98	* device_lock.
		99	* - stripes on the inactive_list never have their stripe_lock held.
		100	* - stripes have a reference counter. If count==0, they are on a list.
		101	* - If a stripe might need handling, STRIPE_HANDLE is set.
		102	* - When refcount reaches zero, then if STRIPE_HANDLE it is put on
		103	* handle_list else inactive_list
		104	*
		105	* This, combined with the fact that STRIPE_HANDLE is only ever
		106	* cleared while a stripe has a non-zero count means that if the
		107	* refcount is 0 and STRIPE_HANDLE is set, then it is on the
		108	* handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
		109	* the stripe is on inactive_list.
		110	*
		111	* The possible transitions are:
		112	* activate an unhashed/inactive stripe (get_active_stripe())
		113	* lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
		114	* activate a hashed, possibly active stripe (get_active_stripe())
		115	* lockdev check-hash if(!cnt++)unlink-stripe unlockdev
		116	* attach a request to an active stripe (add_stripe_bh())
		117	* lockdev attach-buffer unlockdev
		118	* handle a stripe (handle_stripe())
		119	* lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
		120	* release an active stripe (release_stripe())
		121	* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
		122	*
		123	* The refcount counts each thread that has activated the stripe,
		124	* plus raid5d if it is handling it, plus one for each active request
		125	* on a cached buffer.
		126	*/
		127
		128	struct stripe_head { /* one cached stripe: per-stripe state plus one r5dev buffer per disc */
		129	struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
		130	struct list_head lru; /* inactive_list or handle_list */
		131	struct raid5_private_data *raid_conf; /* array-wide configuration this stripe refers to */
		132	sector_t sector; /* sector of this row */
		133	int pd_idx; /* parity disk index */
		134	unsigned long state; /* state flags (STRIPE_*) */
		135	atomic_t count; /* nr of active thread/requests */
		136	spinlock_t lock; /* per-stripe spinlock */
		137	struct r5dev { /* per-disc buffer belonging to this stripe */
		138	struct bio req; /* embedded request; R5_LOCKED means IO has been submitted on it */
		139	struct bio_vec vec;
		140	struct page *page; /* R5_UPTODATE means this page contains current data */
		141	struct bio *toread, *towrite, *written; /* heads of the read, write and written request lists */
		142	sector_t sector; /* sector of this page */
		143	unsigned long flags; /* R5_* flags */
		144	} dev[1]; /* allocated with extra space depending on RAID geometry */
		145	};
		146	/* Flags -- per-buffer flags kept in r5dev.flags (presumably bit numbers rather than masks; TODO confirm against users) */
		147	#define R5_UPTODATE 0 /* page contains current data */
		148	#define R5_LOCKED 1 /* IO has been submitted on "req" */
		149	#define R5_OVERWRITE 2 /* towrite covers whole page */
		150	/* and some that are internal to handle_stripe */
		151	#define R5_Insync 3 /* rdev && rdev->in_sync at start */
		152	#define R5_Wantread 4 /* want to schedule a read */
		153	#define R5_Wantwrite 5 /* presumably: want to schedule a write (by analogy with R5_Wantread) */
		154	#define R5_Syncio 6 /* this io needs to be accounted as resync io */
		155
		156	/*
		157	* Write method (the mode named alongside compute_parity in the state-transition notes)
		158	*/
		159	#define RECONSTRUCT_WRITE 1 /* "RECONSTRUCT" in the state-transition notes */
		160	#define READ_MODIFY_WRITE 2 /* presumably the "RMW" of the state-transition notes */
		161	/* not a write method, but a compute_parity mode */
		162	#define CHECK_PARITY 3
		163
		164	/*
		165	* Stripe state (flags for stripe_head.state)
		166	*/
		167	#define STRIPE_ERROR 1
		168	#define STRIPE_HANDLE 2 /* stripe might need handling; decides handle_list vs inactive_list when refcount hits zero */
		169	#define STRIPE_SYNCING 3
		170	#define STRIPE_INSYNC 4 /* set when a write is scheduled to the spare (or parity if no spare); cleared by a sync request */
		171	#define STRIPE_PREREAD_ACTIVE 5 /* set when preread is initiated on the stripe; cleared when write is initiated */
		172	#define STRIPE_DELAYED 6 /* pre-reading needed but PREREAD_ACTIVE not set: stripe goes to the delayed queue */
		173
		174	/*
		175	* Plugging:
		176	*
		177	* To improve write throughput, we need to delay the handling of some
		178	* stripes until there has been a chance that several write requests
		179	* for the one stripe have all been collected.
		180	* In particular, any write request that would require pre-reading
		181	* is put on a "delayed" queue until there are no stripes currently
		182	* in a pre-read phase. Further, if the "delayed" queue is empty when
		183	* a stripe is put on it then we "plug" the queue and do not process it
		184	* until an unplug call is made. (blk_run_queues is run).
		185	*
		186	* When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
		187	* it to the count of prereading stripes.
		188	* When write is initiated, or the stripe refcnt == 0 (just in case) we
		189	* clear the PREREAD_ACTIVE flag and decrement the count
		190	* Whenever the delayed queue is empty and the device is not plugged, we
		191	* move any stripes from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
		192	* In stripe_handle, if we find pre-reading is necessary, we do it if
		193	* PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
		194	* HANDLE gets cleared if stripe_handle leaves nothing locked.
		195	*/
		196
		197
		198	struct disk_info { /* per-disk slot; used for raid5_private_data.spare and raid5_private_data.disks[] */
		199	mdk_rdev_t *rdev; /* md rdev for this slot (mdk_rdev_t is declared outside this file) */
		200	};
		201
		202	struct raid5_private_data { /* per-array RAID5 state: stripe cache, stripe lists and member disks */
		203	struct stripe_head **stripe_hashtbl; /* hash table used to find stripes by sector number */
		204	mddev_t *mddev;
		205	struct disk_info *spare;
		206	int chunk_size, level, algorithm; /* algorithm: presumably one of ALGORITHM_* */
		207	int raid_disks, working_disks, failed_disks;
		208	int max_nr_stripes;
		209
		210	struct list_head handle_list; /* stripes needing handling */
		211	struct list_head delayed_list; /* stripes that have plugged requests */
		212	atomic_t preread_active_stripes; /* stripes with scheduled io */
		213
		214	char cache_name[20];
		215	kmem_cache_t *slab_cache; /* for allocating stripes */
		216	/*
		217	* Free stripes pool
		218	*/
		219	atomic_t active_stripes;
		220	struct list_head inactive_list;
		221	wait_queue_head_t wait_for_stripe;
		222	int inactive_blocked; /* release of inactive stripes blocked,
		223	* waiting for 25% to be free
		224	*/
		225	spinlock_t device_lock; /* protects inactive_list, handle_list and the hash bucket lists */
		226	struct disk_info disks[0]; /* zero-length trailing array (GNU extension) */
		227	};
		228
		229	typedef struct raid5_private_data raid5_conf_t; /* shorthand for the per-array RAID5 state type */
		230
		231	#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) /* casts mddev->private to raid5_conf_t *; NOTE(review): the argument is not parenthesized, so pass only a simple pointer expression */
		232
		233	/*
		234	* Our supported algorithms (presumably the values held in raid5_private_data.algorithm; TODO confirm)
		235	*/
		236	#define ALGORITHM_LEFT_ASYMMETRIC 0
		237	#define ALGORITHM_RIGHT_ASYMMETRIC 1
		238	#define ALGORITHM_LEFT_SYMMETRIC 2
		239	#define ALGORITHM_RIGHT_SYMMETRIC 3
		240
		241	#endif

Subversion Repositories shark

(root)/shark/trunk/drivers/linuxc26/include/linux/raid/raid5.h - Rev 423