mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	Merge branch 'lock-contention-on-arcs_mtx-final'
Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf
Closes #3115
Closes #3481
This commit is contained in:
		
						commit
						06358ea16e
					
				| @ -191,12 +191,10 @@ def get_arc_summary(Kstat): | ||||
|     ### ARC Misc. ### | ||||
|     deleted = Kstat["kstat.zfs.misc.arcstats.deleted"] | ||||
|     mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"] | ||||
|     recycle_miss = Kstat["kstat.zfs.misc.arcstats.recycle_miss"] | ||||
| 
 | ||||
|     ### ARC Misc. ### | ||||
|     output["arc_misc"] = {} | ||||
|     output["arc_misc"]["deleted"] = fHits(deleted) | ||||
|     output["arc_misc"]['recycle_miss'] = fHits(recycle_miss) | ||||
|     output["arc_misc"]['mutex_miss'] = fHits(mutex_miss) | ||||
|     output["arc_misc"]['evict_skips'] = fHits(mutex_miss) | ||||
| 
 | ||||
| @ -302,8 +300,6 @@ def _arc_summary(Kstat): | ||||
|     ### ARC Misc. ### | ||||
|     sys.stdout.write("ARC Misc:\n") | ||||
|     sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted']) | ||||
|     sys.stdout.write("\tRecycle Misses:\t\t\t\t%s\n" % | ||||
|             arc['arc_misc']['recycle_miss']) | ||||
|     sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" % | ||||
|             arc['arc_misc']['mutex_miss']) | ||||
|     sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" % | ||||
|  | ||||
| @ -82,7 +82,6 @@ cols = { | ||||
|     "mrug":       [4, 1000, "MRU Ghost List hits per second"], | ||||
|     "eskip":      [5, 1000, "evict_skip per second"], | ||||
|     "mtxmis":     [6, 1000, "mutex_miss per second"], | ||||
|     "rmis":       [4, 1000, "recycle_miss per second"], | ||||
|     "dread":      [5, 1000, "Demand accesses per second"], | ||||
|     "pread":      [5, 1000, "Prefetch accesses per second"], | ||||
|     "l2hits":     [6, 1000, "L2ARC hits per second"], | ||||
| @ -406,7 +405,6 @@ def calculate(): | ||||
|     v["mrug"] = d["mru_ghost_hits"] / sint | ||||
|     v["mfug"] = d["mfu_ghost_hits"] / sint | ||||
|     v["eskip"] = d["evict_skip"] / sint | ||||
|     v["rmis"] = d["recycle_miss"] / sint | ||||
|     v["mtxmis"] = d["mutex_miss"] / sint | ||||
| 
 | ||||
|     if l2exist: | ||||
|  | ||||
| @ -1250,7 +1250,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, | ||||
| 	print_indirect(bp, zb, dnp); | ||||
| 
 | ||||
| 	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		arc_flags_t flags = ARC_FLAG_WAIT; | ||||
| 		int i; | ||||
| 		blkptr_t *cbp; | ||||
| 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; | ||||
|  | ||||
| @ -4042,7 +4042,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) | ||||
| 		 * assign an arcbuf to a dbuf. | ||||
| 		 */ | ||||
| 		for (j = 0; j < s; j++) { | ||||
| 			if (i != 5) { | ||||
| 			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { | ||||
| 				bigbuf_arcbufs[j] = | ||||
| 				    dmu_request_arcbuf(bonus_db, chunksize); | ||||
| 			} else { | ||||
| @ -4066,7 +4066,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) | ||||
| 			umem_free(packbuf, packsize); | ||||
| 			umem_free(bigbuf, bigsize); | ||||
| 			for (j = 0; j < s; j++) { | ||||
| 				if (i != 5) { | ||||
| 				if (i != 5 || | ||||
| 				    chunksize < (SPA_MINBLOCKSIZE * 2)) { | ||||
| 					dmu_return_arcbuf(bigbuf_arcbufs[j]); | ||||
| 				} else { | ||||
| 					dmu_return_arcbuf( | ||||
| @ -4111,7 +4112,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) | ||||
| 		} | ||||
| 		for (off = bigoff, j = 0; j < s; j++, off += chunksize) { | ||||
| 			dmu_buf_t *dbt; | ||||
| 			if (i != 5) { | ||||
| 			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { | ||||
| 				bcopy((caddr_t)bigbuf + (off - bigoff), | ||||
| 				    bigbuf_arcbufs[j]->b_data, chunksize); | ||||
| 			} else { | ||||
| @ -4128,7 +4129,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) | ||||
| 				VERIFY(dmu_buf_hold(os, bigobj, off, | ||||
| 				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); | ||||
| 			} | ||||
| 			if (i != 5) { | ||||
| 			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { | ||||
| 				dmu_assign_arcbuf(bonus_db, off, | ||||
| 				    bigbuf_arcbufs[j], tx); | ||||
| 			} else { | ||||
|  | ||||
| @ -33,6 +33,7 @@ COMMON_H = \ | ||||
| 	$(top_srcdir)/include/sys/efi_partition.h \
 | ||||
| 	$(top_srcdir)/include/sys/metaslab.h \
 | ||||
| 	$(top_srcdir)/include/sys/metaslab_impl.h \
 | ||||
| 	$(top_srcdir)/include/sys/multilist.h \
 | ||||
| 	$(top_srcdir)/include/sys/nvpair.h \
 | ||||
| 	$(top_srcdir)/include/sys/nvpair_impl.h \
 | ||||
| 	$(top_srcdir)/include/sys/range_tree.h \
 | ||||
| @ -53,6 +54,7 @@ COMMON_H = \ | ||||
| 	$(top_srcdir)/include/sys/trace_dbuf.h \
 | ||||
| 	$(top_srcdir)/include/sys/trace_dmu.h \
 | ||||
| 	$(top_srcdir)/include/sys/trace_dnode.h \
 | ||||
| 	$(top_srcdir)/include/sys/trace_multilist.h \
 | ||||
| 	$(top_srcdir)/include/sys/trace_txg.h \
 | ||||
| 	$(top_srcdir)/include/sys/trace_zil.h \
 | ||||
| 	$(top_srcdir)/include/sys/trace_zrlock.h \
 | ||||
|  | ||||
| @ -38,6 +38,12 @@ extern "C" { | ||||
| #include <sys/spa.h> | ||||
| #include <sys/refcount.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Used by arc_flush() to inform arc_evict_state() that it should evict | ||||
|  * all available buffers from the arc state being passed in. | ||||
|  */ | ||||
| #define	ARC_EVICT_ALL	-1ULL | ||||
| 
 | ||||
| typedef struct arc_buf_hdr arc_buf_hdr_t; | ||||
| typedef struct arc_buf arc_buf_t; | ||||
| typedef struct arc_prune arc_prune_t; | ||||
| @ -53,10 +59,65 @@ arc_done_func_t arc_getbuf_func; | ||||
| struct arc_prune { | ||||
| 	arc_prune_func_t	*p_pfunc; | ||||
| 	void			*p_private; | ||||
| 	uint64_t		p_adjust; | ||||
| 	list_node_t		p_node; | ||||
| 	refcount_t		p_refcnt; | ||||
| }; | ||||
| 
 | ||||
| typedef enum arc_strategy { | ||||
| 	ARC_STRATEGY_META_ONLY		= 0, /* Evict only meta data buffers */ | ||||
| 	ARC_STRATEGY_META_BALANCED	= 1, /* Evict data buffers if needed */ | ||||
| } arc_strategy_t; | ||||
| 
 | ||||
| typedef enum arc_flags | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Public flags that can be passed into the ARC by external consumers. | ||||
| 	 */ | ||||
| 	ARC_FLAG_NONE			= 1 << 0,	/* No flags set */ | ||||
| 	ARC_FLAG_WAIT			= 1 << 1,	/* perform sync I/O */ | ||||
| 	ARC_FLAG_NOWAIT			= 1 << 2,	/* perform async I/O */ | ||||
| 	ARC_FLAG_PREFETCH		= 1 << 3,	/* I/O is a prefetch */ | ||||
| 	ARC_FLAG_CACHED			= 1 << 4,	/* I/O was in cache */ | ||||
| 	ARC_FLAG_L2CACHE		= 1 << 5,	/* cache in L2ARC */ | ||||
| 	ARC_FLAG_L2COMPRESS		= 1 << 6,	/* compress in L2ARC */ | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Private ARC flags.  These flags are private ARC only flags that | ||||
| 	 * will show up in b_flags in the arc_buf_hdr_t. These flags should | ||||
| 	 * only be set by ARC code. | ||||
| 	 */ | ||||
| 	ARC_FLAG_IN_HASH_TABLE		= 1 << 7,	/* buffer is hashed */ | ||||
| 	ARC_FLAG_IO_IN_PROGRESS		= 1 << 8,	/* I/O in progress */ | ||||
| 	ARC_FLAG_IO_ERROR		= 1 << 9,	/* I/O failed for buf */ | ||||
| 	ARC_FLAG_FREED_IN_READ		= 1 << 10,	/* freed during read */ | ||||
| 	ARC_FLAG_BUF_AVAILABLE		= 1 << 11,	/* block not in use */ | ||||
| 	ARC_FLAG_INDIRECT		= 1 << 12,	/* indirect block */ | ||||
| 	ARC_FLAG_L2_WRITING		= 1 << 13,	/* write in progress */ | ||||
| 	ARC_FLAG_L2_EVICTED		= 1 << 14,	/* evicted during I/O */ | ||||
| 	ARC_FLAG_L2_WRITE_HEAD		= 1 << 15,	/* head of write list */ | ||||
| 	/* indicates that the buffer contains metadata (otherwise, data) */ | ||||
| 	ARC_FLAG_BUFC_METADATA		= 1 << 16, | ||||
| 
 | ||||
| 	/* Flags specifying whether optional hdr struct fields are defined */ | ||||
| 	ARC_FLAG_HAS_L1HDR		= 1 << 17, | ||||
| 	ARC_FLAG_HAS_L2HDR		= 1 << 18, | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The arc buffer's compression mode is stored in the top 7 bits of the | ||||
| 	 * flags field, so these dummy flags are included so that MDB can | ||||
| 	 * interpret the enum properly. | ||||
| 	 */ | ||||
| 	ARC_FLAG_COMPRESS_0		= 1 << 24, | ||||
| 	ARC_FLAG_COMPRESS_1		= 1 << 25, | ||||
| 	ARC_FLAG_COMPRESS_2		= 1 << 26, | ||||
| 	ARC_FLAG_COMPRESS_3		= 1 << 27, | ||||
| 	ARC_FLAG_COMPRESS_4		= 1 << 28, | ||||
| 	ARC_FLAG_COMPRESS_5		= 1 << 29, | ||||
| 	ARC_FLAG_COMPRESS_6		= 1 << 30 | ||||
| 
 | ||||
| } arc_flags_t; | ||||
| 
 | ||||
| struct arc_buf { | ||||
| 	arc_buf_hdr_t		*b_hdr; | ||||
| 	arc_buf_t		*b_next; | ||||
| @ -71,15 +132,6 @@ typedef enum arc_buf_contents { | ||||
| 	ARC_BUFC_METADATA,			/* buffer contains metadata */ | ||||
| 	ARC_BUFC_NUMTYPES | ||||
| } arc_buf_contents_t; | ||||
| /*
 | ||||
|  * These are the flags we pass into calls to the arc | ||||
|  */ | ||||
| #define	ARC_WAIT	(1 << 1)	/* perform I/O synchronously */ | ||||
| #define	ARC_NOWAIT	(1 << 2)	/* perform I/O asynchronously */ | ||||
| #define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */ | ||||
| #define	ARC_CACHED	(1 << 4)	/* I/O was already in cache */ | ||||
| #define	ARC_L2CACHE	(1 << 5)	/* cache in L2ARC */ | ||||
| #define	ARC_L2COMPRESS	(1 << 6)	/* compress in L2ARC */ | ||||
| 
 | ||||
| /*
 | ||||
|  * The following breakdowns of arc_size exist for kstat only. | ||||
| @ -106,7 +158,6 @@ typedef enum arc_state_type { | ||||
| typedef struct arc_buf_info { | ||||
| 	arc_state_type_t	abi_state_type; | ||||
| 	arc_buf_contents_t	abi_state_contents; | ||||
| 	uint64_t		abi_state_index; | ||||
| 	uint32_t		abi_flags; | ||||
| 	uint32_t		abi_datacnt; | ||||
| 	uint64_t		abi_size; | ||||
| @ -146,7 +197,7 @@ int arc_referenced(arc_buf_t *buf); | ||||
| 
 | ||||
| int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, | ||||
|     arc_done_func_t *done, void *private, zio_priority_t priority, int flags, | ||||
|     uint32_t *arc_flags, const zbookmark_phys_t *zb); | ||||
|     arc_flags_t *arc_flags, const zbookmark_phys_t *zb); | ||||
| zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, | ||||
|     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, | ||||
|     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, | ||||
| @ -160,7 +211,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); | ||||
| void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); | ||||
| boolean_t arc_clear_callback(arc_buf_t *buf); | ||||
| 
 | ||||
| void arc_flush(spa_t *spa); | ||||
| void arc_flush(spa_t *spa, boolean_t retry); | ||||
| void arc_tempreserve_clear(uint64_t reserve); | ||||
| int arc_tempreserve_space(uint64_t reserve, uint64_t txg); | ||||
| 
 | ||||
|  | ||||
| @ -67,15 +67,25 @@ extern "C" { | ||||
|  */ | ||||
| 
 | ||||
| typedef struct arc_state { | ||||
| 	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */ | ||||
| 	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */ | ||||
| 	uint64_t arcs_size;	/* total amount of data in this state */ | ||||
| 	kmutex_t arcs_mtx; | ||||
| 	/*
 | ||||
| 	 * list of evictable buffers | ||||
| 	 */ | ||||
| 	multilist_t arcs_list[ARC_BUFC_NUMTYPES]; | ||||
| 	/*
 | ||||
| 	 * total amount of evictable data in this state | ||||
| 	 */ | ||||
| 	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; | ||||
| 	/*
 | ||||
| 	 * total amount of data in this state; this includes: evictable, | ||||
| 	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. | ||||
| 	 */ | ||||
| 	uint64_t arcs_size; | ||||
| 	/*
 | ||||
| 	 * supports the "dbufs" kstat | ||||
| 	 */ | ||||
| 	arc_state_type_t arcs_state; | ||||
| } arc_state_t; | ||||
| 
 | ||||
| typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; | ||||
| 
 | ||||
| typedef struct arc_callback arc_callback_t; | ||||
| 
 | ||||
| struct arc_callback { | ||||
| @ -96,31 +106,49 @@ struct arc_write_callback { | ||||
| 	arc_buf_t	*awcb_buf; | ||||
| }; | ||||
| 
 | ||||
| struct arc_buf_hdr { | ||||
| 	/* protected by hash lock */ | ||||
| 	dva_t			b_dva; | ||||
| 	uint64_t		b_birth; | ||||
| 	uint64_t		b_cksum0; | ||||
| 
 | ||||
| /*
 | ||||
|  * ARC buffers are separated into multiple structs as a memory saving measure: | ||||
|  *   - Common fields struct, always defined, and embedded within it: | ||||
|  *       - L2-only fields, always allocated but undefined when not in L2ARC | ||||
|  *       - L1-only fields, only allocated when in L1ARC | ||||
|  * | ||||
|  *           Buffer in L1                     Buffer only in L2 | ||||
|  *    +------------------------+          +------------------------+ | ||||
|  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          | | ||||
|  *    |                        |          |                        | | ||||
|  *    |                        |          |                        | | ||||
|  *    |                        |          |                        | | ||||
|  *    +------------------------+          +------------------------+ | ||||
|  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        | | ||||
|  *    | (undefined if L1-only) |          |                        | | ||||
|  *    +------------------------+          +------------------------+ | ||||
|  *    | l1arc_buf_hdr_t        | | ||||
|  *    |                        | | ||||
|  *    |                        | | ||||
|  *    |                        | | ||||
|  *    |                        | | ||||
|  *    +------------------------+ | ||||
|  * | ||||
|  * Because it's possible for the L2ARC to become extremely large, we can wind | ||||
|  * up eating a lot of memory in L2ARC buffer headers, so the size of a header | ||||
|  * is minimized by only allocating the fields necessary for an L1-cached buffer | ||||
|  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and | ||||
|  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple | ||||
|  * words in pointers. arc_hdr_realloc() is used to switch a header between | ||||
|  * these two allocation states. | ||||
|  */ | ||||
| typedef struct l1arc_buf_hdr { | ||||
| 	kmutex_t		b_freeze_lock; | ||||
| 	zio_cksum_t		*b_freeze_cksum; | ||||
| 
 | ||||
| 	arc_buf_hdr_t		*b_hash_next; | ||||
| 	arc_buf_t		*b_buf; | ||||
| 	uint32_t		b_flags; | ||||
| 	uint32_t		b_datacnt; | ||||
| 
 | ||||
| 	arc_callback_t		*b_acb; | ||||
| 	/* for waiting on writes to complete */ | ||||
| 	kcondvar_t		b_cv; | ||||
| 
 | ||||
| 	/* immutable */ | ||||
| 	arc_buf_contents_t	b_type; | ||||
| 	uint64_t		b_size; | ||||
| 	uint64_t		b_spa; | ||||
| 
 | ||||
| 	/* protected by arc state mutex */ | ||||
| 	arc_state_t		*b_state; | ||||
| 	list_node_t		b_arc_node; | ||||
| 	multilist_node_t	b_arc_node; | ||||
| 
 | ||||
| 	/* updated atomically */ | ||||
| 	clock_t			b_arc_access; | ||||
| @ -133,9 +161,10 @@ struct arc_buf_hdr { | ||||
| 	/* self protecting */ | ||||
| 	refcount_t		b_refcnt; | ||||
| 
 | ||||
| 	l2arc_buf_hdr_t		*b_l2hdr; | ||||
| 	list_node_t		b_l2node; | ||||
| }; | ||||
| 	arc_callback_t		*b_acb; | ||||
| 	/* temporary buffer holder for in-flight compressed data */ | ||||
| 	void			*b_tmp_cdata; | ||||
| } l1arc_buf_hdr_t; | ||||
| 
 | ||||
| typedef struct l2arc_dev { | ||||
| 	vdev_t			*l2ad_vdev;	/* vdev */ | ||||
| @ -146,15 +175,51 @@ typedef struct l2arc_dev { | ||||
| 	uint64_t		l2ad_evict;	/* last addr eviction reached */ | ||||
| 	boolean_t		l2ad_first;	/* first sweep through */ | ||||
| 	boolean_t		l2ad_writing;	/* currently writing */ | ||||
| 	list_t			*l2ad_buflist;	/* buffer list */ | ||||
| 	kmutex_t		l2ad_mtx;	/* lock for buffer list */ | ||||
| 	list_t			l2ad_buflist;	/* buffer list */ | ||||
| 	list_node_t		l2ad_node;	/* device list node */ | ||||
| } l2arc_dev_t; | ||||
| 
 | ||||
| typedef struct l2arc_buf_hdr { | ||||
| 	/* protected by arc_buf_hdr mutex */ | ||||
| 	l2arc_dev_t		*b_dev;		/* L2ARC device */ | ||||
| 	uint64_t		b_daddr;	/* disk address, offset byte */ | ||||
| 	/* real alloc'd buffer size depending on b_compress applied */ | ||||
| 	uint32_t		b_hits; | ||||
| 	int32_t			b_asize; | ||||
| 
 | ||||
| 	list_node_t		b_l2node; | ||||
| } l2arc_buf_hdr_t; | ||||
| 
 | ||||
| typedef struct l2arc_write_callback { | ||||
| 	l2arc_dev_t	*l2wcb_dev;		/* device info */ | ||||
| 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */ | ||||
| } l2arc_write_callback_t; | ||||
| 
 | ||||
| struct arc_buf_hdr { | ||||
| 	/* protected by hash lock */ | ||||
| 	dva_t			b_dva; | ||||
| 	uint64_t		b_birth; | ||||
| 	/*
 | ||||
| 	 * Even though this checksum is only set/verified when a buffer is in | ||||
| 	 * the L1 cache, it needs to be in the set of common fields because it | ||||
| 	 * must be preserved from the time before a buffer is written out to | ||||
| 	 * L2ARC until after it is read back in. | ||||
| 	 */ | ||||
| 	zio_cksum_t		*b_freeze_cksum; | ||||
| 
 | ||||
| 	arc_buf_hdr_t		*b_hash_next; | ||||
| 	arc_flags_t		b_flags; | ||||
| 
 | ||||
| 	/* immutable */ | ||||
| 	int32_t			b_size; | ||||
| 	uint64_t		b_spa; | ||||
| 
 | ||||
| 	/* L2ARC fields. Undefined when not in L2ARC. */ | ||||
| 	l2arc_buf_hdr_t		b_l2hdr; | ||||
| 	/* L1ARC fields. Undefined when in l2arc_only state */ | ||||
| 	l1arc_buf_hdr_t		b_l1hdr; | ||||
| }; | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
|  | ||||
							
								
								
									
										105
									
								
								include/sys/multilist.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										105
									
								
								include/sys/multilist.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,105 @@ | ||||
| /*
 | ||||
|  * CDDL HEADER START | ||||
|  * | ||||
|  * This file and its contents are supplied under the terms of the | ||||
|  * Common Development and Distribution License ("CDDL"), version 1.0. | ||||
|  * You may only use this file in accordance with the terms of version | ||||
|  * 1.0 of the CDDL. | ||||
|  * | ||||
|  * A full copy of the text of the CDDL should have accompanied this | ||||
|  * source.  A copy of the CDDL is also available via the Internet at | ||||
|  * http://www.illumos.org/license/CDDL.
 | ||||
|  * | ||||
|  * CDDL HEADER END | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2013, 2014 by Delphix. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| #ifndef	_SYS_MULTILIST_H | ||||
| #define	_SYS_MULTILIST_H | ||||
| 
 | ||||
| #include <sys/zfs_context.h> | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| extern "C" { | ||||
| #endif | ||||
| 
 | ||||
| typedef list_node_t multilist_node_t; | ||||
| typedef struct multilist multilist_t; | ||||
| typedef struct multilist_sublist multilist_sublist_t; | ||||
| typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); | ||||
| 
 | ||||
| struct multilist_sublist { | ||||
| 	/*
 | ||||
| 	 * The mutex used internally to implement thread safe insertions | ||||
| 	 * and removals to this individual sublist. It can also be locked | ||||
| 	 * by a consumer using multilist_sublist_{lock,unlock}, which is | ||||
| 	 * useful if a consumer needs to traverse the list in a thread | ||||
| 	 * safe manner. | ||||
| 	 */ | ||||
| 	kmutex_t	mls_lock; | ||||
| 	/*
 | ||||
| 	 * The actual list object containing all objects in this sublist. | ||||
| 	 */ | ||||
| 	list_t		mls_list; | ||||
| 	/*
 | ||||
| 	 * Pad to cache line, in an effort to try and prevent cache line | ||||
| 	 * contention. | ||||
| 	 */ | ||||
| } ____cacheline_aligned; | ||||
| 
 | ||||
| struct multilist { | ||||
| 	/*
 | ||||
| 	 * This is used to get to the multilist_node_t structure given | ||||
| 	 * the void *object contained on the list. | ||||
| 	 */ | ||||
| 	size_t				ml_offset; | ||||
| 	/*
 | ||||
| 	 * The number of sublists used internally by this multilist. | ||||
| 	 */ | ||||
| 	uint64_t			ml_num_sublists; | ||||
| 	/*
 | ||||
| 	 * Pointer to the array of actual sublists. | ||||
| 	 */ | ||||
| 	multilist_sublist_t		*ml_sublists; | ||||
| 	/*
 | ||||
| 	 * Pointer to function which determines the sublist to use | ||||
| 	 * when inserting and removing objects from this multilist. | ||||
| 	 * Please see the comment above multilist_create for details. | ||||
| 	 */ | ||||
| 	multilist_sublist_index_func_t	*ml_index_func; | ||||
| }; | ||||
| 
 | ||||
| void multilist_destroy(multilist_t *); | ||||
| void multilist_create(multilist_t *, size_t, size_t, unsigned int, | ||||
|     multilist_sublist_index_func_t *); | ||||
| 
 | ||||
| void multilist_insert(multilist_t *, void *); | ||||
| void multilist_remove(multilist_t *, void *); | ||||
| int  multilist_is_empty(multilist_t *); | ||||
| 
 | ||||
| unsigned int multilist_get_num_sublists(multilist_t *); | ||||
| unsigned int multilist_get_random_index(multilist_t *); | ||||
| 
 | ||||
| multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); | ||||
| void multilist_sublist_unlock(multilist_sublist_t *); | ||||
| 
 | ||||
| void multilist_sublist_insert_head(multilist_sublist_t *, void *); | ||||
| void multilist_sublist_insert_tail(multilist_sublist_t *, void *); | ||||
| void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); | ||||
| void multilist_sublist_remove(multilist_sublist_t *, void *); | ||||
| 
 | ||||
| void *multilist_sublist_head(multilist_sublist_t *); | ||||
| void *multilist_sublist_tail(multilist_sublist_t *); | ||||
| void *multilist_sublist_next(multilist_sublist_t *, void *); | ||||
| void *multilist_sublist_prev(multilist_sublist_t *, void *); | ||||
| 
 | ||||
| void multilist_link_init(multilist_node_t *); | ||||
| int  multilist_link_active(multilist_node_t *); | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| #endif /* _SYS_MULTILIST_H */ | ||||
| @ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, | ||||
| 	TP_STRUCT__entry( | ||||
| 	    __array(uint64_t,		hdr_dva_word, 2) | ||||
| 	    __field(uint64_t,		hdr_birth) | ||||
| 	    __field(uint64_t,		hdr_cksum0) | ||||
| 	    __field(uint32_t,		hdr_flags) | ||||
| 	    __field(uint32_t,		hdr_datacnt) | ||||
| 	    __field(arc_buf_contents_t,	hdr_type) | ||||
| @ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, | ||||
| 	    __entry->hdr_dva_word[0]	= ab->b_dva.dva_word[0]; | ||||
| 	    __entry->hdr_dva_word[1]	= ab->b_dva.dva_word[1]; | ||||
| 	    __entry->hdr_birth		= ab->b_birth; | ||||
| 	    __entry->hdr_cksum0		= ab->b_cksum0; | ||||
| 	    __entry->hdr_flags		= ab->b_flags; | ||||
| 	    __entry->hdr_datacnt	= ab->b_datacnt; | ||||
| 	    __entry->hdr_type		= ab->b_type; | ||||
| 	    __entry->hdr_datacnt	= ab->b_l1hdr.b_datacnt; | ||||
| 	    __entry->hdr_size		= ab->b_size; | ||||
| 	    __entry->hdr_spa		= ab->b_spa; | ||||
| 	    __entry->hdr_state_type	= ab->b_state->arcs_state; | ||||
| 	    __entry->hdr_access		= ab->b_arc_access; | ||||
| 	    __entry->hdr_mru_hits	= ab->b_mru_hits; | ||||
| 	    __entry->hdr_mru_ghost_hits	= ab->b_mru_ghost_hits; | ||||
| 	    __entry->hdr_mfu_hits	= ab->b_mfu_hits; | ||||
| 	    __entry->hdr_mfu_ghost_hits	= ab->b_mfu_ghost_hits; | ||||
| 	    __entry->hdr_l2_hits	= ab->b_l2_hits; | ||||
| 	    __entry->hdr_refcount	= ab->b_refcnt.rc_count; | ||||
| 	    __entry->hdr_state_type	= ab->b_l1hdr.b_state->arcs_state; | ||||
| 	    __entry->hdr_access		= ab->b_l1hdr.b_arc_access; | ||||
| 	    __entry->hdr_mru_hits	= ab->b_l1hdr.b_mru_hits; | ||||
| 	    __entry->hdr_mru_ghost_hits	= ab->b_l1hdr.b_mru_ghost_hits; | ||||
| 	    __entry->hdr_mfu_hits	= ab->b_l1hdr.b_mfu_hits; | ||||
| 	    __entry->hdr_mfu_ghost_hits	= ab->b_l1hdr.b_mfu_ghost_hits; | ||||
| 	    __entry->hdr_l2_hits	= ab->b_l1hdr.b_l2_hits; | ||||
| 	    __entry->hdr_refcount	= ab->b_l1hdr.b_refcnt.rc_count; | ||||
| 	), | ||||
| 	TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " | ||||
| 	TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " | ||||
| 	    "flags 0x%x datacnt %u type %u size %llu spa %llu " | ||||
| 	    "state_type %u access %lu mru_hits %u mru_ghost_hits %u " | ||||
| 	    "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", | ||||
| 	    __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], | ||||
| 	    __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, | ||||
| 	    __entry->hdr_birth, __entry->hdr_flags, | ||||
| 	    __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, | ||||
| 	    __entry->hdr_spa, __entry->hdr_state_type, | ||||
| 	    __entry->hdr_access, __entry->hdr_mru_hits, | ||||
| @ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, | ||||
| 	TP_STRUCT__entry( | ||||
| 	    __array(uint64_t,		hdr_dva_word, 2) | ||||
| 	    __field(uint64_t,		hdr_birth) | ||||
| 	    __field(uint64_t,		hdr_cksum0) | ||||
| 	    __field(uint32_t,		hdr_flags) | ||||
| 	    __field(uint32_t,		hdr_datacnt) | ||||
| 	    __field(arc_buf_contents_t,	hdr_type) | ||||
| @ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, | ||||
| 	    __entry->hdr_dva_word[0]	= hdr->b_dva.dva_word[0]; | ||||
| 	    __entry->hdr_dva_word[1]	= hdr->b_dva.dva_word[1]; | ||||
| 	    __entry->hdr_birth		= hdr->b_birth; | ||||
| 	    __entry->hdr_cksum0		= hdr->b_cksum0; | ||||
| 	    __entry->hdr_flags		= hdr->b_flags; | ||||
| 	    __entry->hdr_datacnt	= hdr->b_datacnt; | ||||
| 	    __entry->hdr_type		= hdr->b_type; | ||||
| 	    __entry->hdr_datacnt	= hdr->b_l1hdr.b_datacnt; | ||||
| 	    __entry->hdr_size		= hdr->b_size; | ||||
| 	    __entry->hdr_spa		= hdr->b_spa; | ||||
| 	    __entry->hdr_state_type	= hdr->b_state->arcs_state; | ||||
| 	    __entry->hdr_access		= hdr->b_arc_access; | ||||
| 	    __entry->hdr_mru_hits	= hdr->b_mru_hits; | ||||
| 	    __entry->hdr_mru_ghost_hits	= hdr->b_mru_ghost_hits; | ||||
| 	    __entry->hdr_mfu_hits	= hdr->b_mfu_hits; | ||||
| 	    __entry->hdr_mfu_ghost_hits	= hdr->b_mfu_ghost_hits; | ||||
| 	    __entry->hdr_l2_hits	= hdr->b_l2_hits; | ||||
| 	    __entry->hdr_refcount	= hdr->b_refcnt.rc_count; | ||||
| 	    __entry->hdr_state_type	= hdr->b_l1hdr.b_state->arcs_state; | ||||
| 	    __entry->hdr_access		= hdr->b_l1hdr.b_arc_access; | ||||
| 	    __entry->hdr_mru_hits	= hdr->b_l1hdr.b_mru_hits; | ||||
| 	    __entry->hdr_mru_ghost_hits	= hdr->b_l1hdr.b_mru_ghost_hits; | ||||
| 	    __entry->hdr_mfu_hits	= hdr->b_l1hdr.b_mfu_hits; | ||||
| 	    __entry->hdr_mfu_ghost_hits	= hdr->b_l1hdr.b_mfu_ghost_hits; | ||||
| 	    __entry->hdr_l2_hits	= hdr->b_l1hdr.b_l2_hits; | ||||
| 	    __entry->hdr_refcount	= hdr->b_l1hdr.b_refcnt.rc_count; | ||||
| 
 | ||||
| 	    __entry->bp_dva0[0]		= bp->blk_dva[0].dva_word[0]; | ||||
| 	    __entry->bp_dva0[1]		= bp->blk_dva[0].dva_word[1]; | ||||
| @ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, | ||||
| 	    __entry->zb_level		= zb->zb_level; | ||||
| 	    __entry->zb_blkid		= zb->zb_blkid; | ||||
| 	), | ||||
| 	TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " | ||||
| 	    "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u " | ||||
| 	TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " | ||||
| 	    "flags 0x%x datacnt %u size %llu spa %llu state_type %u " | ||||
| 	    "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " | ||||
| 	    "mfu_ghost_hits %u l2_hits %u refcount %lli } " | ||||
| 	    "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " | ||||
| @ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, | ||||
| 	    "lsize %llu } zb { objset %llu object %llu level %lli " | ||||
| 	    "blkid %llu }", | ||||
| 	    __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], | ||||
| 	    __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, | ||||
| 	    __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, | ||||
| 	    __entry->hdr_birth, __entry->hdr_flags, | ||||
| 	    __entry->hdr_datacnt, __entry->hdr_size, | ||||
| 	    __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, | ||||
| 	    __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, | ||||
| 	    __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, | ||||
|  | ||||
							
								
								
									
										76
									
								
								include/sys/trace_multilist.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								include/sys/trace_multilist.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,76 @@ | ||||
| /*
 | ||||
|  * CDDL HEADER START | ||||
|  * | ||||
|  * The contents of this file are subject to the terms of the | ||||
|  * Common Development and Distribution License (the "License"). | ||||
|  * You may not use this file except in compliance with the License. | ||||
|  * | ||||
|  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | ||||
|  * or http://www.opensolaris.org/os/licensing.
 | ||||
|  * See the License for the specific language governing permissions | ||||
|  * and limitations under the License. | ||||
|  * | ||||
|  * When distributing Covered Code, include this CDDL HEADER in each | ||||
|  * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | ||||
|  * If applicable, add the following below this CDDL HEADER, with the | ||||
|  * fields enclosed by brackets "[]" replaced with your own identifying | ||||
|  * information: Portions Copyright [yyyy] [name of copyright owner] | ||||
|  * | ||||
|  * CDDL HEADER END | ||||
|  */ | ||||
| 
 | ||||
| #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) | ||||
| 
 | ||||
| #undef TRACE_SYSTEM | ||||
| #define	TRACE_SYSTEM zfs | ||||
| 
 | ||||
| #if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ) | ||||
| #define	_TRACE_MULTILIST_H | ||||
| 
 | ||||
| #include <linux/tracepoint.h> | ||||
| #include <sys/types.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Generic support for three argument tracepoints of the form: | ||||
|  * | ||||
|  * DTRACE_PROBE3(..., | ||||
|  *     multilist_t *, ..., | ||||
|  *     unsigned int, ..., | ||||
|  *     void *, ...); | ||||
|  */ | ||||
| 
 | ||||
| DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class, | ||||
| 	TP_PROTO(multilist_t *ml, unsigned sublist_idx, void *obj), | ||||
| 	TP_ARGS(ml, sublist_idx, obj), | ||||
| 	TP_STRUCT__entry( | ||||
| 	    __field(size_t,		ml_offset) | ||||
| 	    __field(uint64_t,		ml_num_sublists) | ||||
| 
 | ||||
| 	    __field(unsigned int,	sublist_idx) | ||||
| 	), | ||||
| 	TP_fast_assign( | ||||
| 	    __entry->ml_offset		= ml->ml_offset; | ||||
| 	    __entry->ml_num_sublists	= ml->ml_num_sublists; | ||||
| 
 | ||||
| 	    __entry->sublist_idx	= sublist_idx; | ||||
| 	), | ||||
| 	TP_printk("ml { offset %ld numsublists %llu sublistidx %u } ", | ||||
| 	    __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx) | ||||
| ); | ||||
| 
 | ||||
| #define	DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \ | ||||
| DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \ | ||||
| 	TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \ | ||||
| 	TP_ARGS(ml, sublist_idx, obj)) | ||||
| DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert); | ||||
| DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove); | ||||
| 
 | ||||
| #endif /* _TRACE_MULTILIST_H */ | ||||
| 
 | ||||
| #undef TRACE_INCLUDE_PATH | ||||
| #undef TRACE_INCLUDE_FILE | ||||
| #define	TRACE_INCLUDE_PATH sys | ||||
| #define	TRACE_INCLUDE_FILE trace_multilist | ||||
| #include <trace/define_trace.h> | ||||
| 
 | ||||
| #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ | ||||
| @ -468,6 +468,7 @@ extern void	taskq_init_ent(taskq_ent_t *); | ||||
| extern void	taskq_destroy(taskq_t *); | ||||
| extern void	taskq_wait(taskq_t *); | ||||
| extern void	taskq_wait_id(taskq_t *, taskqid_t); | ||||
| extern void	taskq_wait_outstanding(taskq_t *, taskqid_t); | ||||
| extern int	taskq_member(taskq_t *, kthread_t *); | ||||
| extern int	taskq_cancel_id(taskq_t *, taskqid_t); | ||||
| extern void	system_taskq_init(void); | ||||
| @ -609,6 +610,7 @@ extern void delay(clock_t ticks); | ||||
| 	} while (0); | ||||
| 
 | ||||
| #define	max_ncpus	64 | ||||
| #define	num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN)) | ||||
| 
 | ||||
| #define	minclsyspri	60 | ||||
| #define	maxclsyspri	99 | ||||
|  | ||||
| @ -55,6 +55,7 @@ libzpool_la_SOURCES = \ | ||||
| 	$(top_srcdir)/module/zfs/lzjb.c \
 | ||||
| 	$(top_srcdir)/module/zfs/lz4.c \
 | ||||
| 	$(top_srcdir)/module/zfs/metaslab.c \
 | ||||
| 	$(top_srcdir)/module/zfs/multilist.c \
 | ||||
| 	$(top_srcdir)/module/zfs/range_tree.c \
 | ||||
| 	$(top_srcdir)/module/zfs/refcount.c \
 | ||||
| 	$(top_srcdir)/module/zfs/rrwlock.c \
 | ||||
|  | ||||
| @ -220,6 +220,12 @@ taskq_wait_id(taskq_t *tq, taskqid_t id) | ||||
| 	taskq_wait(tq); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Wait on the given taskq.  This implementation does not consult 'id'; | ||||
|  * it simply delegates to taskq_wait(). | ||||
|  */ | ||||
| void | ||||
| taskq_wait_outstanding(taskq_t *tq, taskqid_t id) | ||||
| { | ||||
| 	taskq_wait(tq); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| taskq_thread(void *arg) | ||||
| { | ||||
|  | ||||
| @ -347,6 +347,19 @@ increased to reduce the memory footprint. | ||||
| Default value: \fB8192\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_arc_evict_batch_limit\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| Number of ARC headers to evict per sub-list before proceeding to another sub-list. | ||||
| This batch-style operation prevents entire sub-lists from being evicted at once | ||||
| but comes at a cost of additional unlocking and locking. | ||||
| .sp | ||||
| Default value: \fB10\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| @ -395,6 +408,19 @@ for meta data. | ||||
| Default value: \fB0\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_arc_meta_min\fR (ulong) | ||||
| .ad | ||||
| .RS 12n | ||||
| The minimum allowed size in bytes that meta data buffers may consume in | ||||
| the ARC.  This value defaults to 0 which disables a floor on the amount | ||||
| of the ARC devoted to meta data. | ||||
| .sp | ||||
| Default value: \fB0\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| @ -447,6 +473,40 @@ Min life of prefetch block | ||||
| Default value: \fB100\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_arc_num_sublists_per_state\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| To allow more fine-grained locking, each ARC state contains a series | ||||
| of lists for both data and meta data objects.  Locking is performed at | ||||
| the level of these "sub-lists".  This parameter controls the number of | ||||
| sub-lists per ARC state. | ||||
| .sp | ||||
| Default value: 1 or the number of online CPUs, whichever is greater | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
| \fBzfs_arc_overflow_shift\fR (int) | ||||
| .ad | ||||
| .RS 12n | ||||
| The ARC size is considered to be overflowing if it exceeds the current | ||||
| ARC target size (arc_c) by a threshold determined by this parameter. | ||||
| The threshold is calculated as a fraction of arc_c using the formula | ||||
| "arc_c >> \fBzfs_arc_overflow_shift\fR". | ||||
| 
 | ||||
| The default value of 8 causes the ARC to be considered to be overflowing | ||||
| if it exceeds the target size by 1/256th (0.3%) of the target size. | ||||
| 
 | ||||
| When the ARC is overflowing, new buffer allocations are stalled until | ||||
| the reclaim thread catches up and the overflow condition no longer exists. | ||||
| .sp | ||||
| Default value: \fB8\fR. | ||||
| .RE | ||||
| 
 | ||||
| .sp | ||||
| .ne 2 | ||||
| .na | ||||
|  | ||||
| @ -37,6 +37,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o | ||||
| $(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o | ||||
| $(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o | ||||
| $(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o | ||||
| $(MODULE)-objs += @top_srcdir@/module/zfs/multilist.o | ||||
| $(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o | ||||
| $(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o | ||||
| $(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o | ||||
|  | ||||
							
								
								
									
										3541
									
								
								module/zfs/arc.c
									
									
									
									
									
								
							
							
						
						
									
										3541
									
								
								module/zfs/arc.c
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -653,7 +653,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) | ||||
| { | ||||
| 	dnode_t *dn; | ||||
| 	zbookmark_phys_t zb; | ||||
| 	uint32_t aflags = ARC_NOWAIT; | ||||
| 	uint32_t aflags = ARC_FLAG_NOWAIT; | ||||
| 	int err; | ||||
| 
 | ||||
| 	DB_DNODE_ENTER(db); | ||||
| @ -707,9 +707,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) | ||||
| 	mutex_exit(&db->db_mtx); | ||||
| 
 | ||||
| 	if (DBUF_IS_L2CACHEABLE(db)) | ||||
| 		aflags |= ARC_L2CACHE; | ||||
| 		aflags |= ARC_FLAG_L2CACHE; | ||||
| 	if (DBUF_IS_L2COMPRESSIBLE(db)) | ||||
| 		aflags |= ARC_L2COMPRESS; | ||||
| 		aflags |= ARC_FLAG_L2COMPRESS; | ||||
| 
 | ||||
| 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? | ||||
| 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, | ||||
| @ -721,7 +721,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) | ||||
| 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, | ||||
| 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, | ||||
| 	    &aflags, &zb); | ||||
| 	if (aflags & ARC_CACHED) | ||||
| 	if (aflags & ARC_FLAG_CACHED) | ||||
| 		*flags |= DB_RF_CACHED; | ||||
| 
 | ||||
| 	return (SET_ERROR(err)); | ||||
| @ -2028,7 +2028,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) | ||||
| 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { | ||||
| 		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { | ||||
| 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; | ||||
| 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; | ||||
| 			arc_flags_t aflags = | ||||
| 			    ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | ||||
| 			zbookmark_phys_t zb; | ||||
| 
 | ||||
| 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, | ||||
|  | ||||
| @ -48,12 +48,12 @@ dbuf_stats_hash_table_headers(char *buf, size_t size) | ||||
| 	(void) snprintf(buf, size, | ||||
| 	    "%-88s | %-124s | %s\n" | ||||
| 	    "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " | ||||
| 	    "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " | ||||
| 	    "%-5s %-5s %-8s %-6s %-8s %-12s " | ||||
| 	    "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " | ||||
| 	    "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", | ||||
| 	    "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", | ||||
| 	    "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", | ||||
| 	    "atype", "index", "flags", "count", "asize", "access", | ||||
| 	    "atype", "flags", "count", "asize", "access", | ||||
| 	    "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", | ||||
| 	    "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", | ||||
| 	    "bsize", "lvls", "dholds", "blocks", "dsize"); | ||||
| @ -77,7 +77,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) | ||||
| 
 | ||||
| 	nwritten = snprintf(buf, size, | ||||
| 	    "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " | ||||
| 	    "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " | ||||
| 	    "%-5d %-5d 0x%-6x %-6lu %-8llu %-12llu " | ||||
| 	    "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " | ||||
| 	    "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", | ||||
| 	    /* dmu_buf_impl_t */ | ||||
| @ -94,7 +94,6 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) | ||||
| 	    /* arc_buf_info_t */ | ||||
| 	    abi.abi_state_type, | ||||
| 	    abi.abi_state_contents, | ||||
| 	    (longlong_t)abi.abi_state_index, | ||||
| 	    abi.abi_flags, | ||||
| 	    (ulong_t)abi.abi_datacnt, | ||||
| 	    (u_longlong_t)abi.abi_size, | ||||
|  | ||||
| @ -129,7 +129,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | ||||
| 	} else if (zb->zb_level == 0) { | ||||
| 		dnode_phys_t *blk; | ||||
| 		arc_buf_t *abuf; | ||||
| 		uint32_t aflags = ARC_WAIT; | ||||
| 		arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 		int blksz = BP_GET_LSIZE(bp); | ||||
| 		int i; | ||||
| 
 | ||||
|  | ||||
| @ -306,15 +306,15 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, | ||||
| 	os->os_spa = spa; | ||||
| 	os->os_rootbp = bp; | ||||
| 	if (!BP_IS_HOLE(os->os_rootbp)) { | ||||
| 		uint32_t aflags = ARC_WAIT; | ||||
| 		arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 		zbookmark_phys_t zb; | ||||
| 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, | ||||
| 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); | ||||
| 
 | ||||
| 		if (DMU_OS_IS_L2CACHEABLE(os)) | ||||
| 			aflags |= ARC_L2CACHE; | ||||
| 			aflags |= ARC_FLAG_L2CACHE; | ||||
| 		if (DMU_OS_IS_L2COMPRESSIBLE(os)) | ||||
| 			aflags |= ARC_L2COMPRESS; | ||||
| 			aflags |= ARC_FLAG_L2COMPRESS; | ||||
| 
 | ||||
| 		dprintf_bp(os->os_rootbp, "reading %s", ""); | ||||
| 		err = arc_read(NULL, spa, os->os_rootbp, | ||||
|  | ||||
| @ -486,7 +486,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | ||||
| 		dnode_phys_t *blk; | ||||
| 		int i; | ||||
| 		int blksz = BP_GET_LSIZE(bp); | ||||
| 		uint32_t aflags = ARC_WAIT; | ||||
| 		arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 		arc_buf_t *abuf; | ||||
| 
 | ||||
| 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, | ||||
| @ -504,7 +504,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | ||||
| 		} | ||||
| 		(void) arc_buf_remove_ref(abuf, &abuf); | ||||
| 	} else if (type == DMU_OT_SA) { | ||||
| 		uint32_t aflags = ARC_WAIT; | ||||
| 		arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 		arc_buf_t *abuf; | ||||
| 		int blksz = BP_GET_LSIZE(bp); | ||||
| 
 | ||||
| @ -521,8 +521,8 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | ||||
| 		err = dump_write_embedded(dsp, zb->zb_object, | ||||
| 		    zb->zb_blkid * blksz, blksz, bp); | ||||
| 	} else { /* it's a level-0 block of a regular object */ | ||||
| 		uint32_t aflags = ARC_WAIT; | ||||
| 		uint64_t offset; | ||||
| 		arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 		arc_buf_t *abuf; | ||||
| 		int blksz = BP_GET_LSIZE(bp); | ||||
| 
 | ||||
|  | ||||
| @ -177,7 +177,7 @@ static void | ||||
| traverse_prefetch_metadata(traverse_data_t *td, | ||||
|     const blkptr_t *bp, const zbookmark_phys_t *zb) | ||||
| { | ||||
| 	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; | ||||
| 	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | ||||
| 
 | ||||
| 	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) | ||||
| 		return; | ||||
| @ -273,7 +273,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, | ||||
| 	} | ||||
| 
 | ||||
| 	if (BP_GET_LEVEL(bp) > 0) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		uint32_t flags = ARC_FLAG_WAIT; | ||||
| 		int32_t i; | ||||
| 		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; | ||||
| 		zbookmark_phys_t *czb; | ||||
| @ -307,7 +307,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, | ||||
| 		kmem_free(czb, sizeof (zbookmark_phys_t)); | ||||
| 
 | ||||
| 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		uint32_t flags = ARC_FLAG_WAIT; | ||||
| 		int32_t i; | ||||
| 		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; | ||||
| 		dnode_phys_t *cdnp; | ||||
| @ -331,7 +331,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, | ||||
| 				break; | ||||
| 		} | ||||
| 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		arc_flags_t flags = ARC_FLAG_WAIT; | ||||
| 		objset_phys_t *osp; | ||||
| 		dnode_phys_t *mdnp, *gdnp, *udnp; | ||||
| 
 | ||||
| @ -448,7 +448,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | ||||
|     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) | ||||
| { | ||||
| 	prefetch_data_t *pfd = arg; | ||||
| 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; | ||||
| 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | ||||
| 
 | ||||
| 	ASSERT(pfd->pd_bytes_fetched >= 0); | ||||
| 	if (pfd->pd_cancel) | ||||
| @ -545,7 +545,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, | ||||
| 
 | ||||
| 	/* See comment on ZIL traversal in dsl_scan_visitds. */ | ||||
| 	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		uint32_t flags = ARC_FLAG_WAIT; | ||||
| 		objset_phys_t *osp; | ||||
| 		arc_buf_t *buf; | ||||
| 
 | ||||
|  | ||||
| @ -317,7 +317,14 @@ dsl_pool_close(dsl_pool_t *dp) | ||||
| 	txg_list_destroy(&dp->dp_sync_tasks); | ||||
| 	txg_list_destroy(&dp->dp_dirty_dirs); | ||||
| 
 | ||||
| 	arc_flush(dp->dp_spa); | ||||
| 	/*
 | ||||
| 	 * We can't set retry to TRUE since we're explicitly specifying | ||||
| 	 * a spa to flush. This is good enough; any missed buffers for | ||||
| 	 * this spa won't cause trouble, and they'll eventually fall | ||||
| 	 * out of the ARC just like any other unused buffer. | ||||
| 	 */ | ||||
| 	arc_flush(dp->dp_spa, FALSE); | ||||
| 
 | ||||
| 	txg_fini(dp); | ||||
| 	dsl_scan_fini(dp); | ||||
| 	dmu_buf_user_evict_wait(); | ||||
|  | ||||
| @ -590,7 +590,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, | ||||
|     uint64_t objset, uint64_t object, uint64_t blkid) | ||||
| { | ||||
| 	zbookmark_phys_t czb; | ||||
| 	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; | ||||
| 	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; | ||||
| 
 | ||||
| 	if (zfs_no_scrub_prefetch) | ||||
| 		return; | ||||
| @ -655,7 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, | ||||
| 	int err; | ||||
| 
 | ||||
| 	if (BP_GET_LEVEL(bp) > 0) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		arc_flags_t flags = ARC_FLAG_WAIT; | ||||
| 		int i; | ||||
| 		blkptr_t *cbp; | ||||
| 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; | ||||
| @ -682,7 +682,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, | ||||
| 		} | ||||
| 		(void) arc_buf_remove_ref(buf, &buf); | ||||
| 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		arc_flags_t flags = ARC_FLAG_WAIT; | ||||
| 		dnode_phys_t *cdnp; | ||||
| 		int i, j; | ||||
| 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; | ||||
| @ -708,7 +708,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, | ||||
| 
 | ||||
| 		(void) arc_buf_remove_ref(buf, &buf); | ||||
| 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { | ||||
| 		uint32_t flags = ARC_WAIT; | ||||
| 		arc_flags_t flags = ARC_FLAG_WAIT; | ||||
| 		objset_phys_t *osp; | ||||
| 		arc_buf_t *buf; | ||||
| 
 | ||||
|  | ||||
| @ -556,7 +556,7 @@ metaslab_group_passivate(metaslab_group_t *mg) | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	taskq_wait(mg->mg_taskq); | ||||
| 	taskq_wait_outstanding(mg->mg_taskq, 0); | ||||
| 	metaslab_group_alloc_update(mg); | ||||
| 
 | ||||
| 	mgprev = mg->mg_prev; | ||||
| @ -1596,7 +1596,7 @@ metaslab_group_preload(metaslab_group_t *mg) | ||||
| 	int m = 0; | ||||
| 
 | ||||
| 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) { | ||||
| 		taskq_wait(mg->mg_taskq); | ||||
| 		taskq_wait_outstanding(mg->mg_taskq, 0); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										375
									
								
								module/zfs/multilist.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										375
									
								
								module/zfs/multilist.c
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,375 @@ | ||||
| /*
 | ||||
|  * CDDL HEADER START | ||||
|  * | ||||
|  * This file and its contents are supplied under the terms of the | ||||
|  * Common Development and Distribution License ("CDDL"), version 1.0. | ||||
|  * You may only use this file in accordance with the terms of version | ||||
|  * 1.0 of the CDDL. | ||||
|  * | ||||
|  * A full copy of the text of the CDDL should have accompanied this | ||||
|  * source.  A copy of the CDDL is also available via the Internet at | ||||
|  * http://www.illumos.org/license/CDDL.
 | ||||
|  * | ||||
|  * CDDL HEADER END | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2013, 2014 by Delphix. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| #include <sys/zfs_context.h> | ||||
| #include <sys/multilist.h> | ||||
| #include <sys/trace_multilist.h> | ||||
| 
 | ||||
| /* needed for spa_get_random() */ | ||||
| #include <sys/spa.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Translate a pointer to an object stored on the multilist into a | ||||
|  * pointer to the multilist_node_t embedded within that object, using | ||||
|  * the byte offset recorded in the multilist. | ||||
|  */ | ||||
| #ifdef DEBUG | ||||
| static multilist_node_t * | ||||
| multilist_d2l(multilist_t *ml, void *obj) | ||||
| { | ||||
| 	char *base = obj; | ||||
| 
 | ||||
| 	return ((multilist_node_t *)(base + ml->ml_offset)); | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| /*
 | ||||
|  * Initialize a new multilist using the parameters specified. | ||||
|  * | ||||
|  *  - 'ml' is the caller-provided multilist_t to initialize. | ||||
|  *  - 'size' denotes the size of the structure containing the | ||||
|  *     multilist_node_t. | ||||
|  *  - 'offset' denotes the byte offset of the multilist_node_t within | ||||
|  *     the structure that contains it. | ||||
|  *  - 'num' specifies the number of internal sublists to create. | ||||
|  *  - 'index_func' is used to determine which sublist to insert into | ||||
|  *     when the multilist_insert() function is called; as well as which | ||||
|  *     sublist to remove from when multilist_remove() is called. The | ||||
|  *     requirements this function must meet, are the following: | ||||
|  * | ||||
|  *      - It must always return the same value when called on the same | ||||
|  *        object (to ensure the object is removed from the list it was | ||||
|  *        inserted into). | ||||
|  * | ||||
|  *      - It must return a value in the range [0, number of sublists). | ||||
|  *        The multilist_get_num_sublists() function may be used to | ||||
|  *        determine the number of sublists in the multilist. | ||||
|  * | ||||
|  *     Also, in order to reduce internal contention between the sublists | ||||
|  *     during insertion and removal, this function should choose evenly | ||||
|  *     between all available sublists when inserting. This isn't a hard | ||||
|  *     requirement, but a general rule of thumb in order to garner the | ||||
|  *     best multi-threaded performance out of the data structure. | ||||
|  */ | ||||
| void | ||||
| multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num, | ||||
|     multilist_sublist_index_func_t *index_func) | ||||
| { | ||||
| 	unsigned int i; | ||||
| 
 | ||||
| 	ASSERT3P(ml, !=, NULL); | ||||
| 	ASSERT3U(size, >, 0); | ||||
| 	ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); | ||||
| 	ASSERT3U(num, >, 0); | ||||
| 	ASSERT3P(index_func, !=, NULL); | ||||
| 
 | ||||
| 	ml->ml_offset = offset; | ||||
| 	ml->ml_num_sublists = num; | ||||
| 	ml->ml_index_func = index_func; | ||||
| 
 | ||||
| 	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * | ||||
| 	    ml->ml_num_sublists, KM_SLEEP); | ||||
| 
 | ||||
| 	ASSERT3P(ml->ml_sublists, !=, NULL); | ||||
| 
 | ||||
| 	/* The index is unsigned, like the count it is compared against. */ | ||||
| 	for (i = 0; i < ml->ml_num_sublists; i++) { | ||||
| 		multilist_sublist_t *mls = &ml->ml_sublists[i]; | ||||
| 		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 		list_create(&mls->mls_list, size, offset); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Destroy the given multilist object, and free up any memory it holds. | ||||
|  * | ||||
|  * The multilist is expected to be empty.  'ml' itself is not freed; each | ||||
|  * sublist's list and lock are destroyed, the sublist array is released, | ||||
|  * and the sublist count, offset and array pointer are cleared. | ||||
|  */ | ||||
| void | ||||
| multilist_destroy(multilist_t *ml) | ||||
| { | ||||
| 	unsigned int i; | ||||
| 
 | ||||
| 	ASSERT(multilist_is_empty(ml)); | ||||
| 
 | ||||
| 	for (i = 0; i < ml->ml_num_sublists; i++) { | ||||
| 		multilist_sublist_t *mls = &ml->ml_sublists[i]; | ||||
| 
 | ||||
| 		ASSERT(list_is_empty(&mls->mls_list)); | ||||
| 
 | ||||
| 		list_destroy(&mls->mls_list); | ||||
| 		mutex_destroy(&mls->mls_lock); | ||||
| 	} | ||||
| 
 | ||||
| 	ASSERT3P(ml->ml_sublists, !=, NULL); | ||||
| 	kmem_free(ml->ml_sublists, | ||||
| 	    sizeof (multilist_sublist_t) * ml->ml_num_sublists); | ||||
| 
 | ||||
| 	/* Don't leave a pointer to the freed array behind. */ | ||||
| 	ml->ml_sublists = NULL; | ||||
| 	ml->ml_num_sublists = 0; | ||||
| 	ml->ml_offset = 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Add 'obj' to the multilist. | ||||
|  * | ||||
|  * The destination sublist is selected by the index function supplied | ||||
|  * at multilist creation time, and the object is placed at the head of | ||||
|  * that sublist. | ||||
|  * | ||||
|  * The sublist lock is taken here unless the calling thread already | ||||
|  * holds it, so that inserts and removes from multiple threads remain | ||||
|  * consistent. | ||||
|  */ | ||||
| void | ||||
| multilist_insert(multilist_t *ml, void *obj) | ||||
| { | ||||
| 	unsigned int idx = ml->ml_index_func(ml, obj); | ||||
| 	multilist_sublist_t *sublist; | ||||
| 	boolean_t locked_here; | ||||
| 
 | ||||
| 	DTRACE_PROBE3(multilist__insert, multilist_t *, ml, | ||||
| 	    unsigned int, idx, void *, obj); | ||||
| 
 | ||||
| 	ASSERT3U(idx, <, ml->ml_num_sublists); | ||||
| 
 | ||||
| 	sublist = ml->ml_sublists + idx; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * A caller may have taken the sublist lock already through | ||||
| 	 * multilist_sublist_lock().  MUTEX_HELD() is relied upon to be | ||||
| 	 * TRUE if and only if the current thread is the holder, which | ||||
| 	 * effectively makes the lock recursive here.  That is a little | ||||
| 	 * ugly, but it spares callers from having to pass around a | ||||
| 	 * flag saying whether they already hold the lock. | ||||
| 	 */ | ||||
| 	locked_here = !MUTEX_HELD(&sublist->mls_lock); | ||||
| 	if (locked_here) | ||||
| 		mutex_enter(&sublist->mls_lock); | ||||
| 
 | ||||
| 	/* The object must not already be linked onto a list. */ | ||||
| 	ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); | ||||
| 	multilist_sublist_insert_head(sublist, obj); | ||||
| 
 | ||||
| 	if (locked_here) | ||||
| 		mutex_exit(&sublist->mls_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Take 'obj' off of the multilist. | ||||
|  * | ||||
|  * The sublist to remove from is selected by the index function supplied | ||||
|  * at multilist creation time. | ||||
|  * | ||||
|  * The sublist lock is taken here unless the calling thread already | ||||
|  * holds it, so that inserts and removes from multiple threads remain | ||||
|  * consistent. | ||||
|  */ | ||||
| void | ||||
| multilist_remove(multilist_t *ml, void *obj) | ||||
| { | ||||
| 	unsigned int idx = ml->ml_index_func(ml, obj); | ||||
| 	multilist_sublist_t *sublist; | ||||
| 	boolean_t locked_here; | ||||
| 
 | ||||
| 	DTRACE_PROBE3(multilist__remove, multilist_t *, ml, | ||||
| 	    unsigned int, idx, void *, obj); | ||||
| 
 | ||||
| 	ASSERT3U(idx, <, ml->ml_num_sublists); | ||||
| 
 | ||||
| 	sublist = ml->ml_sublists + idx; | ||||
| 
 | ||||
| 	/* Same lock handling as multilist_insert(); see the comment there. */ | ||||
| 	locked_here = !MUTEX_HELD(&sublist->mls_lock); | ||||
| 	if (locked_here) | ||||
| 		mutex_enter(&sublist->mls_lock); | ||||
| 
 | ||||
| 	/* The object must currently be linked onto a list. */ | ||||
| 	ASSERT(multilist_link_active(multilist_d2l(ml, obj))); | ||||
| 	multilist_sublist_remove(sublist, obj); | ||||
| 
 | ||||
| 	if (locked_here) | ||||
| 		mutex_exit(&sublist->mls_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Report whether this multilist is empty. | ||||
|  * | ||||
|  * Returns TRUE when every sublist was found to be empty and FALSE as | ||||
|  * soon as a non-empty sublist is encountered. Each sublist lock is | ||||
|  * acquired as needed (it is left alone if the calling thread already | ||||
|  * holds it). | ||||
|  * | ||||
|  * With concurrent insertions and removals the answer is only | ||||
|  * approximate. The sublists are not all locked for the duration of the | ||||
|  * call; each one is locked only while it is individually checked. It is | ||||
|  * therefore possible for this function to return TRUE while some | ||||
|  * sublist is non-empty: another thread may insert into a sublist after | ||||
|  * that sublist was checked and deemed empty, but before the remaining | ||||
|  * sublists have been checked. | ||||
|  */ | ||||
| int | ||||
| multilist_is_empty(multilist_t *ml) | ||||
| { | ||||
| 	int i; | ||||
| 
 | ||||
| 	for (i = 0; i < ml->ml_num_sublists; i++) { | ||||
| 		multilist_sublist_t *sublist = &ml->ml_sublists[i]; | ||||
| 		/* Same lock handling as multilist_insert(). */ | ||||
| 		boolean_t locked_here = !MUTEX_HELD(&sublist->mls_lock); | ||||
| 		int empty; | ||||
| 
 | ||||
| 		if (locked_here) | ||||
| 			mutex_enter(&sublist->mls_lock); | ||||
| 
 | ||||
| 		empty = list_is_empty(&sublist->mls_list); | ||||
| 
 | ||||
| 		if (locked_here) | ||||
| 			mutex_exit(&sublist->mls_lock); | ||||
| 
 | ||||
| 		if (!empty) | ||||
| 			return (FALSE); | ||||
| 	} | ||||
| 
 | ||||
| 	return (TRUE); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the number of sublists composing this multilist, as stored in | ||||
|  * its ml_num_sublists field. | ||||
|  */ | ||||
| unsigned int | ||||
| multilist_get_num_sublists(multilist_t *ml) | ||||
| { | ||||
| 	return (ml->ml_num_sublists); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return a randomly selected, valid sublist index for this multilist. | ||||
|  * The value comes from spa_get_random(), called with the number of | ||||
|  * sublists as its argument. | ||||
|  */ | ||||
| unsigned int | ||||
| multilist_get_random_index(multilist_t *ml) | ||||
| { | ||||
| 	return (spa_get_random(ml->ml_num_sublists)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Acquire the lock of the sublist at index 'sublist_idx' and return a | ||||
|  * pointer to that sublist.  The index must be less than the number of | ||||
|  * sublists in the multilist. | ||||
|  */ | ||||
| multilist_sublist_t * | ||||
| multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) | ||||
| { | ||||
| 	multilist_sublist_t *sublist; | ||||
| 
 | ||||
| 	ASSERT3U(sublist_idx, <, ml->ml_num_sublists); | ||||
| 
 | ||||
| 	sublist = ml->ml_sublists + sublist_idx; | ||||
| 	mutex_enter(&sublist->mls_lock); | ||||
| 
 | ||||
| 	return (sublist); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Release the lock of the given sublist.  The calling thread must hold | ||||
|  * the lock; this is asserted, as in the other sublist operations. | ||||
|  */ | ||||
| void | ||||
| multilist_sublist_unlock(multilist_sublist_t *mls) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	mutex_exit(&mls->mls_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Insert 'obj' at the head of the given sublist.  The sublist lock must | ||||
|  * be held by the caller (asserted). | ||||
|  * | ||||
|  * We're allowing any object to be inserted into this specific sublist, | ||||
|  * but this can lead to trouble if multilist_remove() is called to | ||||
|  * remove this object. Specifically, if calling ml_index_func on this | ||||
|  * object returns an index for a sublist different from the one passed | ||||
|  * as a parameter here, any call to multilist_remove() with this newly | ||||
|  * inserted object is undefined! (the call to multilist_remove() will | ||||
|  * remove the object from a list that it isn't contained in) | ||||
|  */ | ||||
| void | ||||
| multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	list_insert_head(&mls->mls_list, obj); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Insert 'obj' at the tail of the given sublist.  The sublist lock must | ||||
|  * be held by the caller (asserted).  Please see the comment above | ||||
|  * multilist_sublist_insert_head() for a caveat that applies here too. | ||||
|  */ | ||||
| void | ||||
| multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	list_insert_tail(&mls->mls_list, obj); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Move the object one element forward in the list. | ||||
|  * | ||||
|  * This function will move the given object forward in the list (towards | ||||
|  * the head) by one object. So, in essence, it will swap its position in | ||||
|  * the list with its "prev" pointer. If the given object is already at the | ||||
|  * head of the list, it cannot be moved forward any more than it already | ||||
|  * is, so no action is taken. | ||||
|  * | ||||
|  * The sublist lock must be held by the caller and the sublist must not | ||||
|  * be empty; both are asserted before the list is examined. | ||||
|  * | ||||
|  * NOTE: This function **must not** remove any object from the list other | ||||
|  *       than the object given as the parameter. This is relied upon in | ||||
|  *       arc_evict_state_impl(). | ||||
|  */ | ||||
| void | ||||
| multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) | ||||
| { | ||||
| 	void *prev; | ||||
| 
 | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	ASSERT(!list_is_empty(&mls->mls_list)); | ||||
| 
 | ||||
| 	/* Only look at the list links once the lock is known to be held. */ | ||||
| 	prev = list_prev(&mls->mls_list, obj); | ||||
| 
 | ||||
| 	/* 'obj' must be at the head of the list, nothing to do */ | ||||
| 	if (prev == NULL) | ||||
| 		return; | ||||
| 
 | ||||
| 	list_remove(&mls->mls_list, obj); | ||||
| 	list_insert_before(&mls->mls_list, prev, obj); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Remove 'obj' from the given sublist.  The sublist lock must be held | ||||
|  * by the caller (asserted). | ||||
|  */ | ||||
| void | ||||
| multilist_sublist_remove(multilist_sublist_t *mls, void *obj) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	list_remove(&mls->mls_list, obj); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the object at the head of the given sublist, as reported by | ||||
|  * list_head().  The sublist lock must be held by the caller (asserted). | ||||
|  */ | ||||
| void * | ||||
| multilist_sublist_head(multilist_sublist_t *mls) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	return (list_head(&mls->mls_list)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the object at the tail of the given sublist, as reported by | ||||
|  * list_tail().  The sublist lock must be held by the caller (asserted). | ||||
|  */ | ||||
| void * | ||||
| multilist_sublist_tail(multilist_sublist_t *mls) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	return (list_tail(&mls->mls_list)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the object that follows 'obj' on the given sublist, as | ||||
|  * reported by list_next().  The sublist lock must be held by the caller | ||||
|  * (asserted). | ||||
|  */ | ||||
| void * | ||||
| multilist_sublist_next(multilist_sublist_t *mls, void *obj) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	return (list_next(&mls->mls_list, obj)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the object that precedes 'obj' on the given sublist, as | ||||
|  * reported by list_prev().  The sublist lock must be held by the caller | ||||
|  * (asserted). | ||||
|  */ | ||||
| void * | ||||
| multilist_sublist_prev(multilist_sublist_t *mls, void *obj) | ||||
| { | ||||
| 	ASSERT(MUTEX_HELD(&mls->mls_lock)); | ||||
| 	return (list_prev(&mls->mls_list, obj)); | ||||
| } | ||||
| 
 | ||||
| /* Initialize a multilist node by handing it to list_link_init(). */ | ||||
| void | ||||
| multilist_link_init(multilist_node_t *link) | ||||
| { | ||||
| 	list_link_init(link); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the result of list_link_active() for the given multilist node. | ||||
|  * multilist_insert() asserts this is false before inserting an object, | ||||
|  * and multilist_remove() asserts it is true before removing one. | ||||
|  */ | ||||
| int | ||||
| multilist_link_active(multilist_node_t *link) | ||||
| { | ||||
| 	return (list_link_active(link)); | ||||
| }
| @ -200,7 +200,7 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) | ||||
| 	if (zfs_read_history == 0 && ssh->size == 0) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) | ||||
| 	if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) | ||||
| 		return; | ||||
| 
 | ||||
| 	srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); | ||||
|  | ||||
| @ -23,6 +23,7 @@ | ||||
|  * (and only one) C file, so this dummy file exists for that purpose. | ||||
|  */ | ||||
| 
 | ||||
| #include <sys/multilist.h> | ||||
| #include <sys/arc_impl.h> | ||||
| #include <sys/vdev_impl.h> | ||||
| #include <sys/zio.h> | ||||
| @ -31,6 +32,7 @@ | ||||
| #include <sys/dsl_dataset.h> | ||||
| #include <sys/dmu_tx.h> | ||||
| #include <sys/dnode.h> | ||||
| #include <sys/multilist.h> | ||||
| #include <sys/zfs_znode.h> | ||||
| #include <sys/zil_impl.h> | ||||
| #include <sys/zrlock.h> | ||||
| @ -42,6 +44,7 @@ | ||||
| #include <sys/trace_dbuf.h> | ||||
| #include <sys/trace_dmu.h> | ||||
| #include <sys/trace_dnode.h> | ||||
| #include <sys/trace_multilist.h> | ||||
| #include <sys/trace_txg.h> | ||||
| #include <sys/trace_zil.h> | ||||
| #include <sys/trace_zrlock.h> | ||||
|  | ||||
| @ -471,7 +471,7 @@ txg_wait_callbacks(dsl_pool_t *dp) | ||||
| 	tx_state_t *tx = &dp->dp_tx; | ||||
| 
 | ||||
| 	if (tx->tx_commit_cb_taskq != NULL) | ||||
| 		taskq_wait(tx->tx_commit_cb_taskq); | ||||
| 		taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
|  | ||||
| @ -1152,8 +1152,8 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) | ||||
| 		 */ | ||||
| 		int round = 0; | ||||
| 		while (zsb->z_nr_znodes > 0) { | ||||
| 			taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool( | ||||
| 			    zsb->z_os))); | ||||
| 			taskq_wait_outstanding(dsl_pool_iput_taskq( | ||||
| 			    dmu_objset_pool(zsb->z_os)), 0); | ||||
| 			if (++round > 1 && !unmounting) | ||||
| 				break; | ||||
| 		} | ||||
| @ -1740,7 +1740,7 @@ zfs_init(void) | ||||
| void | ||||
| zfs_fini(void) | ||||
| { | ||||
| 	taskq_wait(system_taskq); | ||||
| 	taskq_wait_outstanding(system_taskq, 0); | ||||
| 	unregister_filesystem(&zpl_fs_type); | ||||
| 	zfs_znode_fini(); | ||||
| 	zfsctl_fini(); | ||||
|  | ||||
| @ -204,7 +204,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, | ||||
|     char **end) | ||||
| { | ||||
| 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; | ||||
| 	uint32_t aflags = ARC_WAIT; | ||||
| 	arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 	arc_buf_t *abuf = NULL; | ||||
| 	zbookmark_phys_t zb; | ||||
| 	int error; | ||||
| @ -280,7 +280,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) | ||||
| { | ||||
| 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; | ||||
| 	const blkptr_t *bp = &lr->lr_blkptr; | ||||
| 	uint32_t aflags = ARC_WAIT; | ||||
| 	arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 	arc_buf_t *abuf = NULL; | ||||
| 	zbookmark_phys_t zb; | ||||
| 	int error; | ||||
|  | ||||
| @ -2241,7 +2241,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) | ||||
| 
 | ||||
| 		if (ddp->ddp_phys_birth != 0) { | ||||
| 			arc_buf_t *abuf = NULL; | ||||
| 			uint32_t aflags = ARC_WAIT; | ||||
| 			arc_flags_t aflags = ARC_FLAG_WAIT; | ||||
| 			blkptr_t blk = *zio->io_bp; | ||||
| 			int error; | ||||
| 
 | ||||
|  | ||||
| @ -439,7 +439,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) | ||||
| 	 * fault injection isn't a performance critical path. | ||||
| 	 */ | ||||
| 	if (flags & ZINJECT_FLUSH_ARC) | ||||
| 		arc_flush(NULL); | ||||
| 		/*
 | ||||
| 		 * We must use FALSE to ensure arc_flush returns, since | ||||
| 		 * we're not preventing concurrent ARC insertions. | ||||
| 		 */ | ||||
| 		arc_flush(NULL, FALSE); | ||||
| 
 | ||||
| 	return (0); | ||||
| } | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Brian Behlendorf
						Brian Behlendorf