mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	OpenZFS 7968 - multi-threaded spa_sync()
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Matthew Ahrens <mahrens@delphix.com>

spa_sync() iterates over all the dirty dnodes and processes each of them by calling dnode_sync(). If there are many dirty dnodes (e.g. because we created or removed a lot of files), the single thread of spa_sync() calling dnode_sync() can become a bottleneck.

Additionally, if many dnodes are dirtied concurrently in open context (e.g. due to concurrent file creation), the os_lock will experience lock contention via dnode_setdirty().

The solution is to track dirty dnodes on a multilist_t, and for spa_sync() to use separate threads to process each of the sublists in the multilist.

OpenZFS-issue: https://www.illumos.org/issues/7968
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/4a2a54c
Closes #5752
This commit is contained in:
		
							parent
							
								
									a3478c0747
								
							
						
					
					
						commit
						64fc776208
					
				| @ -70,7 +70,7 @@ typedef struct arc_state { | ||||
| 	/*
 | ||||
| 	 * list of evictable buffers | ||||
| 	 */ | ||||
| 	multilist_t arcs_list[ARC_BUFC_NUMTYPES]; | ||||
| 	multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; | ||||
| 	/*
 | ||||
| 	 * total amount of evictable data in this state | ||||
| 	 */ | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  */ | ||||
| @ -113,7 +113,7 @@ struct objset { | ||||
| 	/* no lock needed: */ | ||||
| 	struct dmu_tx *os_synctx; /* XXX sketchy */ | ||||
| 	zil_header_t os_zil_header; | ||||
| 	list_t os_synced_dnodes; | ||||
| 	multilist_t *os_synced_dnodes; | ||||
| 	uint64_t os_flags; | ||||
| 	uint64_t os_freed_dnodes; | ||||
| 	boolean_t os_rescan_dnodes; | ||||
| @ -124,11 +124,13 @@ struct objset { | ||||
| 
 | ||||
| 	/* Protected by os_lock */ | ||||
| 	kmutex_t os_lock; | ||||
| 	list_t os_dirty_dnodes[TXG_SIZE]; | ||||
| 	list_t os_free_dnodes[TXG_SIZE]; | ||||
| 	multilist_t *os_dirty_dnodes[TXG_SIZE]; | ||||
| 	list_t os_dnodes; | ||||
| 	list_t os_downgraded_dbufs; | ||||
| 
 | ||||
| 	/* Protects changes to DMU_{USER,GROUP}USED_OBJECT */ | ||||
| 	kmutex_t os_userused_lock; | ||||
| 
 | ||||
| 	/* stuff we store for the user */ | ||||
| 	kmutex_t os_user_ptr_lock; | ||||
| 	void *os_user_ptr; | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| @ -35,6 +35,7 @@ | ||||
| #include <sys/refcount.h> | ||||
| #include <sys/dmu_zfetch.h> | ||||
| #include <sys/zrlock.h> | ||||
| #include <sys/multilist.h> | ||||
| 
 | ||||
| #ifdef	__cplusplus | ||||
| extern "C" { | ||||
| @ -243,7 +244,7 @@ struct dnode { | ||||
| 	uint32_t dn_dbufs_count;	/* count of dn_dbufs */ | ||||
| 
 | ||||
| 	/* protected by the dirty-dnode multilist's sublist lock: */ | ||||
| 	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */ | ||||
| 	multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ | ||||
| 
 | ||||
| 	/* protected by dn_mtx: */ | ||||
| 	kmutex_t dn_mtx; | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2013 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| @ -124,6 +124,7 @@ typedef struct dsl_pool { | ||||
| 	txg_list_t dp_dirty_zilogs; | ||||
| 	txg_list_t dp_dirty_dirs; | ||||
| 	txg_list_t dp_sync_tasks; | ||||
| 	taskq_t *dp_sync_taskq; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Protects administrative changes (properties, namespace) | ||||
|  | ||||
| @ -72,8 +72,7 @@ struct multilist { | ||||
| }; | ||||
| 
 | ||||
| void multilist_destroy(multilist_t *); | ||||
| void multilist_create(multilist_t *, size_t, size_t, | ||||
|     multilist_sublist_index_func_t *); | ||||
| multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *); | ||||
| 
 | ||||
| void multilist_insert(multilist_t *, void *); | ||||
| void multilist_remove(multilist_t *, void *); | ||||
| @ -83,6 +82,7 @@ unsigned int multilist_get_num_sublists(multilist_t *); | ||||
| unsigned int multilist_get_random_index(multilist_t *); | ||||
| 
 | ||||
| multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); | ||||
| multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); | ||||
| void multilist_sublist_unlock(multilist_sublist_t *); | ||||
| 
 | ||||
| void multilist_sublist_insert_head(multilist_sublist_t *, void *); | ||||
|  | ||||
| @ -22,7 +22,7 @@ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright 2011 Nexenta Systems, Inc. All rights reserved. | ||||
|  * Copyright (c) 2012, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| @ -454,7 +454,7 @@ struct zio { | ||||
| 	taskq_ent_t	io_tqent; | ||||
| }; | ||||
| 
 | ||||
| extern int zio_timestamp_compare(const void *, const void *); | ||||
| extern int zio_bookmark_compare(const void *, const void *); | ||||
| 
 | ||||
| extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, | ||||
|     zio_done_func_t *done, void *private, enum zio_flag flags); | ||||
|  | ||||
| @ -1927,7 +1927,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) | ||||
| 	    (state != arc_anon)) { | ||||
| 		/* We don't use the L2-only state list. */ | ||||
| 		if (state != arc_l2c_only) { | ||||
| 			multilist_remove(&state->arcs_list[arc_buf_type(hdr)], | ||||
| 			multilist_remove(state->arcs_list[arc_buf_type(hdr)], | ||||
| 			    hdr); | ||||
| 			arc_evictable_space_decrement(hdr, state); | ||||
| 		} | ||||
| @ -1957,7 +1957,7 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) | ||||
| 	 */ | ||||
| 	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && | ||||
| 	    (state != arc_anon)) { | ||||
| 		multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); | ||||
| 		multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); | ||||
| 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); | ||||
| 		arc_evictable_space_increment(hdr, state); | ||||
| 	} | ||||
| @ -2059,7 +2059,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, | ||||
| 	if (refcnt == 0) { | ||||
| 		if (old_state != arc_anon && old_state != arc_l2c_only) { | ||||
| 			ASSERT(HDR_HAS_L1HDR(hdr)); | ||||
| 			multilist_remove(&old_state->arcs_list[buftype], hdr); | ||||
| 			multilist_remove(old_state->arcs_list[buftype], hdr); | ||||
| 
 | ||||
| 			if (GHOST_STATE(old_state)) { | ||||
| 				ASSERT0(bufcnt); | ||||
| @ -2076,7 +2076,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, | ||||
| 			 * beforehand. | ||||
| 			 */ | ||||
| 			ASSERT(HDR_HAS_L1HDR(hdr)); | ||||
| 			multilist_insert(&new_state->arcs_list[buftype], hdr); | ||||
| 			multilist_insert(new_state->arcs_list[buftype], hdr); | ||||
| 
 | ||||
| 			if (GHOST_STATE(new_state)) { | ||||
| 				ASSERT0(bufcnt); | ||||
| @ -2204,8 +2204,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, | ||||
| 	 * L2 headers should never be on the L2 state list since they don't | ||||
| 	 * have L1 headers allocated. | ||||
| 	 */ | ||||
| 	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && | ||||
| 	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); | ||||
| 	ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && | ||||
| 	    multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| @ -3302,7 +3302,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, | ||||
|     arc_buf_contents_t type) | ||||
| { | ||||
| 	uint64_t total_evicted = 0; | ||||
| 	multilist_t *ml = &state->arcs_list[type]; | ||||
| 	multilist_t *ml = state->arcs_list[type]; | ||||
| 	int num_sublists; | ||||
| 	arc_buf_hdr_t **markers; | ||||
| 	int i; | ||||
| @ -3681,8 +3681,8 @@ arc_adjust_meta(void) | ||||
| static arc_buf_contents_t | ||||
| arc_adjust_type(arc_state_t *state) | ||||
| { | ||||
| 	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; | ||||
| 	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; | ||||
| 	multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; | ||||
| 	multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; | ||||
| 	int data_idx = multilist_get_random_index(data_ml); | ||||
| 	int meta_idx = multilist_get_random_index(meta_ml); | ||||
| 	multilist_sublist_t *data_mls; | ||||
| @ -6281,44 +6281,44 @@ arc_state_init(void) | ||||
| 	arc_mfu_ghost = &ARC_mfu_ghost; | ||||
| 	arc_l2c_only = &ARC_l2c_only; | ||||
| 
 | ||||
| 	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mru->arcs_list[ARC_BUFC_METADATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mru->arcs_list[ARC_BUFC_DATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mfu->arcs_list[ARC_BUFC_METADATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mfu->arcs_list[ARC_BUFC_DATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], | ||||
| 	    sizeof (arc_buf_hdr_t), | ||||
| 	arc_l2c_only->arcs_list[ARC_BUFC_DATA] = | ||||
| 	    multilist_create(sizeof (arc_buf_hdr_t), | ||||
| 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), | ||||
| 	    arc_state_multilist_index_func); | ||||
| 
 | ||||
| @ -6373,16 +6373,16 @@ arc_state_fini(void) | ||||
| 	refcount_destroy(&arc_mfu_ghost->arcs_size); | ||||
| 	refcount_destroy(&arc_l2c_only->arcs_size); | ||||
| 
 | ||||
| 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); | ||||
| 	multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); | ||||
| 	multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); | ||||
| } | ||||
| 
 | ||||
| uint64_t | ||||
| @ -7065,16 +7065,16 @@ l2arc_sublist_lock(int list_num) | ||||
| 
 | ||||
| 	switch (list_num) { | ||||
| 	case 0: | ||||
| 		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; | ||||
| 		ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; | ||||
| 		break; | ||||
| 	case 1: | ||||
| 		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; | ||||
| 		ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; | ||||
| 		break; | ||||
| 	case 2: | ||||
| 		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; | ||||
| 		ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; | ||||
| 		break; | ||||
| 	case 3: | ||||
| 		ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; | ||||
| 		ml = arc_mru->arcs_list[ARC_BUFC_DATA]; | ||||
| 		break; | ||||
| 	default: | ||||
| 		return (NULL); | ||||
|  | ||||
| @ -21,7 +21,7 @@ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved. | ||||
|  * Copyright (c) 2012, 2015 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  */ | ||||
| @ -104,7 +104,7 @@ static boolean_t dbuf_evict_thread_exit; | ||||
|  * Dbufs that are aged out of the cache will be immediately destroyed and | ||||
|  * become eligible for arc eviction. | ||||
|  */ | ||||
| static multilist_t dbuf_cache; | ||||
| static multilist_t *dbuf_cache; | ||||
| static refcount_t dbuf_cache_size; | ||||
| unsigned long  dbuf_cache_max_bytes = 100 * 1024 * 1024; | ||||
| 
 | ||||
| @ -491,8 +491,8 @@ dbuf_cache_above_lowater(void) | ||||
| static void | ||||
| dbuf_evict_one(void) | ||||
| { | ||||
| 	int idx = multilist_get_random_index(&dbuf_cache); | ||||
| 	multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); | ||||
| 	int idx = multilist_get_random_index(dbuf_cache); | ||||
| 	multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx); | ||||
| 	dmu_buf_impl_t *db; | ||||
| 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); | ||||
| 
 | ||||
| @ -671,7 +671,7 @@ retry: | ||||
| 	 */ | ||||
| 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); | ||||
| 
 | ||||
| 	multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), | ||||
| 	dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t), | ||||
| 	    offsetof(dmu_buf_impl_t, db_cache_link), | ||||
| 	    dbuf_cache_multilist_index_func); | ||||
| 	refcount_create(&dbuf_cache_size); | ||||
| @ -719,7 +719,7 @@ dbuf_fini(void) | ||||
| 	cv_destroy(&dbuf_evict_cv); | ||||
| 
 | ||||
| 	refcount_destroy(&dbuf_cache_size); | ||||
| 	multilist_destroy(&dbuf_cache); | ||||
| 	multilist_destroy(dbuf_cache); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @ -2120,7 +2120,7 @@ dbuf_destroy(dmu_buf_impl_t *db) | ||||
| 	dbuf_clear_data(db); | ||||
| 
 | ||||
| 	if (multilist_link_active(&db->db_cache_link)) { | ||||
| 		multilist_remove(&dbuf_cache, db); | ||||
| 		multilist_remove(dbuf_cache, db); | ||||
| 		(void) refcount_remove_many(&dbuf_cache_size, | ||||
| 		    db->db.db_size, db); | ||||
| 	} | ||||
| @ -2690,7 +2690,7 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) | ||||
| 
 | ||||
| 	if (multilist_link_active(&dh->dh_db->db_cache_link)) { | ||||
| 		ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); | ||||
| 		multilist_remove(&dbuf_cache, dh->dh_db); | ||||
| 		multilist_remove(dbuf_cache, dh->dh_db); | ||||
| 		(void) refcount_remove_many(&dbuf_cache_size, | ||||
| 		    dh->dh_db->db.db_size, dh->dh_db); | ||||
| 	} | ||||
| @ -2962,7 +2962,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) | ||||
| 			    db->db_pending_evict) { | ||||
| 				dbuf_destroy(db); | ||||
| 			} else if (!multilist_link_active(&db->db_cache_link)) { | ||||
| 				multilist_insert(&dbuf_cache, db); | ||||
| 				multilist_insert(dbuf_cache, db); | ||||
| 				(void) refcount_add_many(&dbuf_cache_size, | ||||
| 				    db->db.db_size, db); | ||||
| 				mutex_exit(&db->db_mtx); | ||||
|  | ||||
| @ -21,7 +21,7 @@ | ||||
| 
 | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. | ||||
|  * Copyright (c) 2013, Joyent, Inc. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
| @ -343,6 +343,38 @@ dmu_objset_byteswap(void *buf, size_t size) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * The hash is a CRC-based hash of the objset_t pointer and the object number. | ||||
|  */ | ||||
| static uint64_t | ||||
| dnode_hash(const objset_t *os, uint64_t obj) | ||||
| { | ||||
| 	uintptr_t osv = (uintptr_t)os; | ||||
| 	uint64_t crc = -1ULL; | ||||
| 
 | ||||
| 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); | ||||
| 	/*
 | ||||
| 	 * The low 6 bits of the pointer don't have much entropy, because | ||||
| 	 * the objset_t is larger than 2^6 bytes long. | ||||
| 	 */ | ||||
| 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; | ||||
| 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; | ||||
| 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; | ||||
| 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; | ||||
| 
 | ||||
| 	crc ^= (osv>>14) ^ (obj>>24); | ||||
| 
 | ||||
| 	return (crc); | ||||
| } | ||||
| 
 | ||||
| unsigned int | ||||
| dnode_multilist_index_func(multilist_t *ml, void *obj) | ||||
| { | ||||
| 	dnode_t *dn = obj; | ||||
| 	return (dnode_hash(dn->dn_objset, dn->dn_object) % | ||||
| 	    multilist_get_num_sublists(ml)); | ||||
| } | ||||
| 
 | ||||
| int | ||||
| dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, | ||||
|     objset_t **osp) | ||||
| @ -500,10 +532,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, | ||||
| 	os->os_zil = zil_alloc(os, &os->os_zil_header); | ||||
| 
 | ||||
| 	for (i = 0; i < TXG_SIZE; i++) { | ||||
| 		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), | ||||
| 		    offsetof(dnode_t, dn_dirty_link[i])); | ||||
| 		list_create(&os->os_free_dnodes[i], sizeof (dnode_t), | ||||
| 		    offsetof(dnode_t, dn_dirty_link[i])); | ||||
| 		os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), | ||||
| 		    offsetof(dnode_t, dn_dirty_link[i]), | ||||
| 		    dnode_multilist_index_func); | ||||
| 	} | ||||
| 	list_create(&os->os_dnodes, sizeof (dnode_t), | ||||
| 	    offsetof(dnode_t, dn_link)); | ||||
| @ -513,6 +544,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, | ||||
| 	list_link_init(&os->os_evicting_node); | ||||
| 
 | ||||
| 	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 
 | ||||
| @ -811,8 +843,12 @@ dmu_objset_evict_done(objset_t *os) | ||||
| 	rw_exit(&os_lock); | ||||
| 
 | ||||
| 	mutex_destroy(&os->os_lock); | ||||
| 	mutex_destroy(&os->os_userused_lock); | ||||
| 	mutex_destroy(&os->os_obj_lock); | ||||
| 	mutex_destroy(&os->os_user_ptr_lock); | ||||
| 	for (int i = 0; i < TXG_SIZE; i++) { | ||||
| 		multilist_destroy(os->os_dirty_dnodes[i]); | ||||
| 	} | ||||
| 	spa_evicting_os_deregister(os->os_spa, os); | ||||
| 	kmem_free(os, sizeof (objset_t)); | ||||
| } | ||||
| @ -1153,11 +1189,11 @@ dmu_objset_upgrade_stop(objset_t *os) | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) | ||||
| dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) | ||||
| { | ||||
| 	dnode_t *dn; | ||||
| 
 | ||||
| 	while ((dn = list_head(list))) { | ||||
| 	while ((dn = multilist_sublist_head(list)) != NULL) { | ||||
| 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); | ||||
| 		ASSERT(dn->dn_dbuf->db_data_pending); | ||||
| 		/*
 | ||||
| @ -1168,11 +1204,12 @@ dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) | ||||
| 		ASSERT(dn->dn_zio); | ||||
| 
 | ||||
| 		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); | ||||
| 		list_remove(list, dn); | ||||
| 		multilist_sublist_remove(list, dn); | ||||
| 
 | ||||
| 		if (newlist) { | ||||
| 		multilist_t *newlist = dn->dn_objset->os_synced_dnodes; | ||||
| 		if (newlist != NULL) { | ||||
| 			(void) dnode_add_ref(dn, newlist); | ||||
| 			list_insert_tail(newlist, dn); | ||||
| 			multilist_insert(newlist, dn); | ||||
| 		} | ||||
| 
 | ||||
| 		dnode_sync(dn, tx); | ||||
| @ -1229,6 +1266,29 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) | ||||
| 	kmem_free(bp, sizeof (*bp)); | ||||
| } | ||||
| 
 | ||||
| typedef struct sync_dnodes_arg { | ||||
| 	multilist_t *sda_list; | ||||
| 	int sda_sublist_idx; | ||||
| 	multilist_t *sda_newlist; | ||||
| 	dmu_tx_t *sda_tx; | ||||
| } sync_dnodes_arg_t; | ||||
| 
 | ||||
| static void | ||||
| sync_dnodes_task(void *arg) | ||||
| { | ||||
| 	sync_dnodes_arg_t *sda = arg; | ||||
| 
 | ||||
| 	multilist_sublist_t *ms = | ||||
| 	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); | ||||
| 
 | ||||
| 	dmu_objset_sync_dnodes(ms, sda->sda_tx); | ||||
| 
 | ||||
| 	multilist_sublist_unlock(ms); | ||||
| 
 | ||||
| 	kmem_free(sda, sizeof (*sda)); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* called from dsl */ | ||||
| void | ||||
| dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) | ||||
| @ -1238,7 +1298,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) | ||||
| 	zio_prop_t zp; | ||||
| 	zio_t *zio; | ||||
| 	list_t *list; | ||||
| 	list_t *newlist = NULL; | ||||
| 	dbuf_dirty_record_t *dr; | ||||
| 	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); | ||||
| 	*blkptr_copy = *os->os_rootbp; | ||||
| @ -1292,20 +1351,36 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) | ||||
| 	txgoff = tx->tx_txg & TXG_MASK; | ||||
| 
 | ||||
| 	if (dmu_objset_userused_enabled(os)) { | ||||
| 		newlist = &os->os_synced_dnodes; | ||||
| 		/*
 | ||||
| 		 * We must create the list here because it uses the | ||||
| 		 * dn_dirty_link[] of this txg. | ||||
| 		 * dn_dirty_link[] of this txg.  But it may already | ||||
| 		 * exist because we call dsl_dataset_sync() twice per txg. | ||||
| 		 */ | ||||
| 		list_create(newlist, sizeof (dnode_t), | ||||
| 		    offsetof(dnode_t, dn_dirty_link[txgoff])); | ||||
| 		if (os->os_synced_dnodes == NULL) { | ||||
| 			os->os_synced_dnodes = | ||||
| 			    multilist_create(sizeof (dnode_t), | ||||
| 			    offsetof(dnode_t, dn_dirty_link[txgoff]), | ||||
| 			    dnode_multilist_index_func); | ||||
| 		} else { | ||||
| 			ASSERT3U(os->os_synced_dnodes->ml_offset, ==, | ||||
| 			    offsetof(dnode_t, dn_dirty_link[txgoff])); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); | ||||
| 	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); | ||||
| 	for (int i = 0; | ||||
| 	    i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) { | ||||
| 		sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); | ||||
| 		sda->sda_list = os->os_dirty_dnodes[txgoff]; | ||||
| 		sda->sda_sublist_idx = i; | ||||
| 		sda->sda_tx = tx; | ||||
| 		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, | ||||
| 		    sync_dnodes_task, sda, 0); | ||||
| 		/* callback frees sda */ | ||||
| 	} | ||||
| 	taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); | ||||
| 
 | ||||
| 	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; | ||||
| 	while ((dr = list_head(list))) { | ||||
| 	while ((dr = list_head(list)) != NULL) { | ||||
| 		ASSERT0(dr->dr_dbuf->db_level); | ||||
| 		list_remove(list, dr); | ||||
| 		if (dr->dr_zio) | ||||
| @ -1329,8 +1404,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) | ||||
| boolean_t | ||||
| dmu_objset_is_dirty(objset_t *os, uint64_t txg) | ||||
| { | ||||
| 	return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || | ||||
| 	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); | ||||
| 	return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); | ||||
| } | ||||
| 
 | ||||
| static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; | ||||
| @ -1395,8 +1469,15 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) | ||||
| 	cookie = NULL; | ||||
| 	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, | ||||
| 	    &cookie)) != NULL) { | ||||
| 		/*
 | ||||
| 		 * os_userused_lock protects against concurrent calls to | ||||
| 		 * zap_increment().  It's needed because zap_increment() | ||||
| 		 * is not thread-safe (i.e. not atomic). | ||||
| 		 */ | ||||
| 		mutex_enter(&os->os_userused_lock); | ||||
| 		VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, | ||||
| 		    uqn->uqn_id, uqn->uqn_delta, tx)); | ||||
| 		mutex_exit(&os->os_userused_lock); | ||||
| 		kmem_free(uqn, sizeof (*uqn)); | ||||
| 	} | ||||
| 	avl_destroy(&cache->uqc_user_deltas); | ||||
| @ -1404,8 +1485,10 @@ do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) | ||||
| 	cookie = NULL; | ||||
| 	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, | ||||
| 	    &cookie)) != NULL) { | ||||
| 		mutex_enter(&os->os_userused_lock); | ||||
| 		VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, | ||||
| 		    uqn->uqn_id, uqn->uqn_delta, tx)); | ||||
| 		mutex_exit(&os->os_userused_lock); | ||||
| 		kmem_free(uqn, sizeof (*uqn)); | ||||
| 	} | ||||
| 	avl_destroy(&cache->uqc_group_deltas); | ||||
| @ -1469,35 +1552,38 @@ do_userobjquota_update(userquota_cache_t *cache, uint64_t flags, | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) | ||||
| typedef struct userquota_updates_arg { | ||||
| 	objset_t *uua_os; | ||||
| 	int uua_sublist_idx; | ||||
| 	dmu_tx_t *uua_tx; | ||||
| } userquota_updates_arg_t; | ||||
| 
 | ||||
| static void | ||||
| userquota_updates_task(void *arg) | ||||
| { | ||||
| 	userquota_updates_arg_t *uua = arg; | ||||
| 	objset_t *os = uua->uua_os; | ||||
| 	dmu_tx_t *tx = uua->uua_tx; | ||||
| 	dnode_t *dn; | ||||
| 	list_t *list = &os->os_synced_dnodes; | ||||
| 	userquota_cache_t cache = { { 0 } }; | ||||
| 
 | ||||
| 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); | ||||
| 	multilist_sublist_t *list = | ||||
| 	    multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); | ||||
| 
 | ||||
| 	ASSERT(multilist_sublist_head(list) == NULL || | ||||
| 	    dmu_objset_userused_enabled(os)); | ||||
| 	avl_create(&cache.uqc_user_deltas, userquota_compare, | ||||
| 	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); | ||||
| 	avl_create(&cache.uqc_group_deltas, userquota_compare, | ||||
| 	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); | ||||
| 
 | ||||
| 	while ((dn = list_head(list))) { | ||||
| 	while ((dn = multilist_sublist_head(list)) != NULL) { | ||||
| 		int flags; | ||||
| 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); | ||||
| 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || | ||||
| 		    dn->dn_phys->dn_flags & | ||||
| 		    DNODE_FLAG_USERUSED_ACCOUNTED); | ||||
| 
 | ||||
| 		/* Allocate the user/groupused objects if necessary. */ | ||||
| 		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { | ||||
| 			VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT, | ||||
| 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); | ||||
| 			VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT, | ||||
| 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); | ||||
| 		} | ||||
| 
 | ||||
| 		flags = dn->dn_id_flags; | ||||
| 		ASSERT(flags); | ||||
| 		if (flags & DN_ID_OLD_EXIST)  { | ||||
| @ -1530,10 +1616,42 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) | ||||
| 		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); | ||||
| 		mutex_exit(&dn->dn_mtx); | ||||
| 
 | ||||
| 		list_remove(list, dn); | ||||
| 		dnode_rele(dn, list); | ||||
| 		multilist_sublist_remove(list, dn); | ||||
| 		dnode_rele(dn, os->os_synced_dnodes); | ||||
| 	} | ||||
| 	do_userquota_cacheflush(os, &cache, tx); | ||||
| 	multilist_sublist_unlock(list); | ||||
| 	kmem_free(uua, sizeof (*uua)); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) | ||||
| { | ||||
| 	if (!dmu_objset_userused_enabled(os)) | ||||
| 		return; | ||||
| 
 | ||||
| 	/* | ||||
| 	 * Allocate the user/groupused objects if necessary.  Only the | ||||
| 	 * userused dnode is checked: while its type is still DMU_OT_NONE, | ||||
| 	 * both the userused and the groupused objects are claimed in tx. | ||||
| 	 */ | ||||
| 	if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { | ||||
| 		VERIFY0(zap_create_claim(os, | ||||
| 		    DMU_USERUSED_OBJECT, | ||||
| 		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); | ||||
| 		VERIFY0(zap_create_claim(os, | ||||
| 		    DMU_GROUPUSED_OBJECT, | ||||
| 		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); | ||||
| 	} | ||||
| 
 | ||||
| 	for (int i = 0; | ||||
| 	    i < multilist_get_num_sublists(os->os_synced_dnodes); i++) { | ||||
| 		userquota_updates_arg_t *uua = | ||||
| 		    kmem_alloc(sizeof (*uua), KM_SLEEP); | ||||
| 		uua->uua_os = os; | ||||
| 		uua->uua_sublist_idx = i; | ||||
| 		uua->uua_tx = tx; | ||||
| 		/* | ||||
| 		 * One task is dispatched per sublist of os_synced_dnodes. | ||||
| 		 * The dispatch result is ignored and nothing in this | ||||
| 		 * function waits for the task to finish; | ||||
| 		 * note: caller does taskq_wait() | ||||
| 		 */ | ||||
| 		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, | ||||
| 		    userquota_updates_task, uua, 0); | ||||
| 		/* callback (userquota_updates_task) frees uua */ | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| @ -1421,13 +1421,14 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) | ||||
| 	 */ | ||||
| 	dmu_objset_userquota_get_ids(dn, B_TRUE, tx); | ||||
| 
 | ||||
| 	mutex_enter(&os->os_lock); | ||||
| 	multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; | ||||
| 	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we are already marked dirty, we're done. | ||||
| 	 */ | ||||
| 	if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { | ||||
| 		mutex_exit(&os->os_lock); | ||||
| 		multilist_sublist_unlock(mls); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| @ -1441,13 +1442,9 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) | ||||
| 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", | ||||
| 	    dn->dn_object, txg); | ||||
| 
 | ||||
| 	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { | ||||
| 		list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); | ||||
| 	} else { | ||||
| 		list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); | ||||
| 	} | ||||
| 	multilist_sublist_insert_head(mls, dn); | ||||
| 
 | ||||
| 	mutex_exit(&os->os_lock); | ||||
| 	multilist_sublist_unlock(mls); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The dnode maintains a hold on its containing dbuf as | ||||
| @ -1468,13 +1465,6 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) | ||||
| void | ||||
| dnode_free(dnode_t *dn, dmu_tx_t *tx) | ||||
| { | ||||
| 	int txgoff = tx->tx_txg & TXG_MASK; | ||||
| 
 | ||||
| 	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); | ||||
| 
 | ||||
| 	/* we should be the only holder... hopefully */ | ||||
| 	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ | ||||
| 
 | ||||
| 	mutex_enter(&dn->dn_mtx); | ||||
| 	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { | ||||
| 		mutex_exit(&dn->dn_mtx); | ||||
| @ -1483,19 +1473,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx) | ||||
| 	dn->dn_free_txg = tx->tx_txg; | ||||
| 	mutex_exit(&dn->dn_mtx); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If the dnode is already dirty, it needs to be moved from | ||||
| 	 * the dirty list to the free list. | ||||
| 	 */ | ||||
| 	mutex_enter(&dn->dn_objset->os_lock); | ||||
| 	if (list_link_active(&dn->dn_dirty_link[txgoff])) { | ||||
| 		list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); | ||||
| 		list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); | ||||
| 		mutex_exit(&dn->dn_objset->os_lock); | ||||
| 	} else { | ||||
| 		mutex_exit(&dn->dn_objset->os_lock); | ||||
| 		dnode_setdirty(dn, tx); | ||||
| 	} | ||||
| 	dnode_setdirty(dn, tx); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | ||||
| @ -21,7 +21,7 @@ | ||||
| 
 | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2012, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2012, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
|  | ||||
| @ -1699,11 +1699,16 @@ deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | ||||
| void | ||||
| dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) | ||||
| { | ||||
| 	ASSERTV(objset_t *os = ds->ds_objset); | ||||
| 	objset_t *os = ds->ds_objset; | ||||
| 
 | ||||
| 	bplist_iterate(&ds->ds_pending_deadlist, | ||||
| 	    deadlist_enqueue_cb, &ds->ds_deadlist, tx); | ||||
| 
 | ||||
| 	if (os->os_synced_dnodes != NULL) { | ||||
| 		multilist_destroy(os->os_synced_dnodes); | ||||
| 		os->os_synced_dnodes = NULL; | ||||
| 	} | ||||
| 
 | ||||
| 	ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); | ||||
| 
 | ||||
| 	dmu_buf_rele(ds->ds_dbuf, ds); | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2011, 2015 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2011, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2013 Steven Hartland. All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved. | ||||
| @ -132,6 +132,11 @@ unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; | ||||
| hrtime_t zfs_throttle_delay = MSEC2NSEC(10); | ||||
| hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); | ||||
| 
 | ||||
| /*
 | ||||
|  * This determines the number of threads used by the dp_sync_taskq, given | ||||
|  * as a percentage of the CPUs (it is passed with TASKQ_THREADS_CPU_PCT). | ||||
|  */ | ||||
| int zfs_sync_taskq_batch_pct = 75; | ||||
| 
 | ||||
| int | ||||
| dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) | ||||
| { | ||||
| @ -168,6 +173,10 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) | ||||
| 	txg_list_create(&dp->dp_sync_tasks, | ||||
| 	    offsetof(dsl_sync_task_t, dst_node)); | ||||
| 
 | ||||
| 	dp->dp_sync_taskq = taskq_create("dp_sync_taskq", | ||||
| 	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, | ||||
| 	    TASKQ_THREADS_CPU_PCT); | ||||
| 
 | ||||
| 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); | ||||
| 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); | ||||
| 
 | ||||
| @ -326,6 +335,8 @@ dsl_pool_close(dsl_pool_t *dp) | ||||
| 	txg_list_destroy(&dp->dp_sync_tasks); | ||||
| 	txg_list_destroy(&dp->dp_dirty_dirs); | ||||
| 
 | ||||
| 	taskq_destroy(dp->dp_sync_taskq); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We can't set retry to TRUE since we're explicitly specifying | ||||
| 	 * a spa to flush. This is good enough; any missed buffers for | ||||
| @ -514,12 +525,15 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * After the data blocks have been written (ensured by the zio_wait() | ||||
| 	 * above), update the user/group space accounting. | ||||
| 	 * above), update the user/group space accounting.  This happens | ||||
| 	 * in tasks dispatched to dp_sync_taskq, so wait for them before | ||||
| 	 * continuing. | ||||
| 	 */ | ||||
| 	for (ds = list_head(&synced_datasets); ds != NULL; | ||||
| 	    ds = list_next(&synced_datasets, ds)) { | ||||
| 		dmu_objset_do_userquota_updates(ds->ds_objset, tx); | ||||
| 	} | ||||
| 	taskq_wait(dp->dp_sync_taskq); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Sync the datasets again to push out the changes due to | ||||
| @ -567,8 +581,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) | ||||
| 		dp->dp_mos_uncompressed_delta = 0; | ||||
| 	} | ||||
| 
 | ||||
| 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || | ||||
| 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { | ||||
| 	if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) { | ||||
| 		dsl_pool_sync_mos(dp, tx); | ||||
| 	} | ||||
| 
 | ||||
| @ -619,7 +632,8 @@ int | ||||
| dsl_pool_sync_context(dsl_pool_t *dp) | ||||
| { | ||||
| 	return (curthread == dp->dp_tx.tx_sync_thread || | ||||
| 	    spa_is_initializing(dp->dp_spa)); | ||||
| 	    spa_is_initializing(dp->dp_spa) || | ||||
| 	    taskq_member(dp->dp_sync_taskq, curthread)); | ||||
| } | ||||
| 
 | ||||
| uint64_t | ||||
| @ -1116,5 +1130,9 @@ MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data"); | ||||
| 
 | ||||
| module_param(zfs_delay_scale, ulong, 0644); | ||||
| MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity"); | ||||
| 
 | ||||
| module_param(zfs_sync_taskq_batch_pct, int, 0644); | ||||
| MODULE_PARM_DESC(zfs_sync_taskq_batch_pct, | ||||
| 	"max percent of CPUs that are used to sync dirty data"); | ||||
| /* END CSTYLED */ | ||||
| #endif | ||||
|  | ||||
| @ -68,18 +68,16 @@ multilist_d2l(multilist_t *ml, void *obj) | ||||
|  *     requirement, but a general rule of thumb in order to garner the | ||||
|  *     best multi-threaded performance out of the data structure. | ||||
|  */ | ||||
| static void | ||||
| multilist_create_impl(multilist_t *ml, size_t size, size_t offset, | ||||
| static multilist_t * | ||||
| multilist_create_impl(size_t size, size_t offset, | ||||
|     unsigned int num, multilist_sublist_index_func_t *index_func) | ||||
| { | ||||
| 	int i; | ||||
| 
 | ||||
| 	ASSERT3P(ml, !=, NULL); | ||||
| 	ASSERT3U(size, >, 0); | ||||
| 	ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); | ||||
| 	ASSERT3U(num, >, 0); | ||||
| 	ASSERT3P(index_func, !=, NULL); | ||||
| 
 | ||||
| 	multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP); | ||||
| 	ml->ml_offset = offset; | ||||
| 	ml->ml_num_sublists = num; | ||||
| 	ml->ml_index_func = index_func; | ||||
| @ -89,20 +87,21 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset, | ||||
| 
 | ||||
| 	ASSERT3P(ml->ml_sublists, !=, NULL); | ||||
| 
 | ||||
| 	for (i = 0; i < ml->ml_num_sublists; i++) { | ||||
| 	for (int i = 0; i < ml->ml_num_sublists; i++) { | ||||
| 		multilist_sublist_t *mls = &ml->ml_sublists[i]; | ||||
| 		mutex_init(&mls->mls_lock, NULL, MUTEX_NOLOCKDEP, NULL); | ||||
| 		list_create(&mls->mls_list, size, offset); | ||||
| 	} | ||||
| 	return (ml); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Initialize a new sublist, using the default number of sublists | ||||
|  * Allocate a new multilist, using the default number of sublists | ||||
|  * (the number of CPUs, or at least 4, or the tunable | ||||
|  * zfs_multilist_num_sublists). | ||||
|  */ | ||||
| void | ||||
| multilist_create(multilist_t *ml, size_t size, size_t offset, | ||||
| multilist_t * | ||||
| multilist_create(size_t size, size_t offset, | ||||
|     multilist_sublist_index_func_t *index_func) | ||||
| { | ||||
| 	int num_sublists; | ||||
| @ -113,7 +112,7 @@ multilist_create(multilist_t *ml, size_t size, size_t offset, | ||||
| 		num_sublists = MAX(boot_ncpus, 4); | ||||
| 	} | ||||
| 
 | ||||
| 	multilist_create_impl(ml, size, offset, num_sublists, index_func); | ||||
| 	return (multilist_create_impl(size, offset, num_sublists, index_func)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @ -141,6 +140,7 @@ multilist_destroy(multilist_t *ml) | ||||
| 
 | ||||
| 	ml->ml_num_sublists = 0; | ||||
| 	ml->ml_offset = 0; | ||||
| 	kmem_free(ml, sizeof (multilist_t)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @ -294,6 +294,13 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) | ||||
| 	return (mls); | ||||
| } | ||||
| 
 | ||||
| /* | ||||
|  * Lock and return the sublist that would be used to store the specified obj. | ||||
|  * The sublist index is computed by ml's ml_index_func callback, and the | ||||
|  * lock is taken by multilist_sublist_lock(). | ||||
|  * NOTE(review): presumably released with multilist_sublist_unlock(), as | ||||
|  * dnode_setdirty() does -- confirm. | ||||
|  */ | ||||
| multilist_sublist_t * | ||||
| multilist_sublist_lock_obj(multilist_t *ml, void *obj) | ||||
| { | ||||
| 	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| multilist_sublist_unlock(multilist_sublist_t *mls) | ||||
| { | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2011, 2015 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2011, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved. | ||||
|  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. | ||||
|  * Copyright 2013 Saso Kiselkov. All rights reserved. | ||||
| @ -612,7 +612,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) | ||||
| 	if (altroot) | ||||
| 		spa->spa_root = spa_strdup(altroot); | ||||
| 
 | ||||
| 	avl_create(&spa->spa_alloc_tree, zio_timestamp_compare, | ||||
| 	avl_create(&spa->spa_alloc_tree, zio_bookmark_compare, | ||||
| 	    sizeof (zio_t), offsetof(zio_t, io_alloc_node)); | ||||
| 
 | ||||
| 	/*
 | ||||
|  | ||||
| @ -20,7 +20,7 @@ | ||||
|  */ | ||||
| /*
 | ||||
|  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||||
|  * Copyright (c) 2011, 2016 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2011, 2017 by Delphix. All rights reserved. | ||||
|  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. | ||||
|  */ | ||||
| 
 | ||||
| @ -544,21 +544,37 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) | ||||
| } | ||||
| 
 | ||||
| int | ||||
| zio_timestamp_compare(const void *x1, const void *x2) | ||||
| zio_bookmark_compare(const void *x1, const void *x2) | ||||
| { | ||||
| 	const zio_t *z1 = x1; | ||||
| 	const zio_t *z2 = x2; | ||||
| 	int cmp; | ||||
| 
 | ||||
| 	cmp = AVL_CMP(z1->io_queued_timestamp, z2->io_queued_timestamp); | ||||
| 	if (likely(cmp)) | ||||
| 		return (cmp); | ||||
| 	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) | ||||
| 		return (-1); | ||||
| 	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	cmp = AVL_CMP(z1->io_offset, z2->io_offset); | ||||
| 	if (likely(cmp)) | ||||
| 		return (cmp); | ||||
| 	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) | ||||
| 		return (-1); | ||||
| 	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	return (AVL_PCMP(z1, z2)); | ||||
| 	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) | ||||
| 		return (-1); | ||||
| 	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) | ||||
| 		return (-1); | ||||
| 	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	if (z1 < z2) | ||||
| 		return (-1); | ||||
| 	if (z1 > z2) | ||||
| 		return (1); | ||||
| 
 | ||||
| 	return (0); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @ -2953,8 +2969,6 @@ zio_dva_throttle(zio_t *zio) | ||||
| 		return (ZIO_PIPELINE_CONTINUE); | ||||
| 
 | ||||
| 	if (nio != NULL) { | ||||
| 		ASSERT3U(nio->io_queued_timestamp, <=, | ||||
| 		    zio->io_queued_timestamp); | ||||
| 		ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); | ||||
| 		/*
 | ||||
| 		 * We are passing control to a new zio so make sure that | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Matthew Ahrens
						Matthew Ahrens