Mirror of https://git.proxmox.com/git/mirror_zfs.git (synced 2025-10-26 18:05:04 +03:00)
Add Module Parameter Regarding Log Size Limit

zfs_wrlog_data_max
The upper limit of TX_WRITE log data. Once it is reached, write operations
are blocked until log data is cleared out after txg sync. It only counts
TX_WRITE log with WR_COPIED or WR_NEED_COPY.

Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: jxdking <lostking2008@hotmail.com>
Closes #12284
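
For reference, both the new tunable and its companion kstat counter are readable from user space once the module is loaded; the counter increments each time a transaction assignment is blocked by this limit. The sketch below is illustrative only and not part of this commit; it assumes the usual Linux paths /sys/module/zfs/parameters/zfs_wrlog_data_max and /proc/spl/kstat/zfs/dmu_tx.

/*
 * Minimal user-space sketch: print the zfs_wrlog_data_max tunable and the
 * dmu_tx kstat (which now includes dmu_tx_wrlog_over_max).  Both files are
 * plain text on Linux with the zfs module loaded.
 */
#include <stdio.h>

static void
dump_file(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (f == NULL) {
		perror(path);
		return;
	}
	printf("==> %s\n", path);
	while (fgets(line, sizeof (line), f) != NULL)
		fputs(line, stdout);
	fclose(f);
}

int
main(void)
{
	dump_file("/sys/module/zfs/parameters/zfs_wrlog_data_max");
	dump_file("/proc/spl/kstat/zfs/dmu_tx");
	return (0);
}
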
This commit is contained in:

parent 999830a021
commit d05f3039f7
@@ -124,6 +124,7 @@ typedef struct dmu_tx_stats {
 	kstat_named_t dmu_tx_dirty_throttle;
 	kstat_named_t dmu_tx_dirty_delay;
 	kstat_named_t dmu_tx_dirty_over_max;
+	kstat_named_t dmu_tx_wrlog_over_max;
 	kstat_named_t dmu_tx_dirty_frees_delay;
 	kstat_named_t dmu_tx_quota;
 } dmu_tx_stats_t;

@@ -40,6 +40,7 @@
 #include <sys/rrwlock.h>
 #include <sys/dsl_synctask.h>
 #include <sys/mmp.h>
+#include <sys/aggsum.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -58,6 +59,7 @@ struct dsl_deadlist;
 
 extern unsigned long zfs_dirty_data_max;
 extern unsigned long zfs_dirty_data_max_max;
+extern unsigned long zfs_wrlog_data_max;
 extern int zfs_dirty_data_sync_percent;
 extern int zfs_dirty_data_max_percent;
 extern int zfs_dirty_data_max_max_percent;
@@ -118,6 +120,9 @@ typedef struct dsl_pool {
 	uint64_t dp_mos_compressed_delta;
 	uint64_t dp_mos_uncompressed_delta;
 
+	aggsum_t dp_wrlog_pertxg[TXG_SIZE];
+	aggsum_t dp_wrlog_total;
+
 	/*
 	 * Time of most recently scheduled (furthest in the future)
 	 * wakeup for delayed transactions.
@@ -158,6 +163,8 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy);
 uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
     zfs_space_check_t slop_policy);
 uint64_t dsl_pool_deferred_space(dsl_pool_t *dp);
+void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
+boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
 void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
 void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);

@@ -1096,6 +1096,18 @@ Start syncing out a transaction group if there's at least this much dirty data
 This should be less than
 .Sy zfs_vdev_async_write_active_min_dirty_percent .
 .
+.It Sy zfs_wrlog_data_max Ns = Pq int
+The upper limit of write-transaction zil log data size in bytes.
+Once it is reached, write operation is blocked, until log data is cleared out
+after transaction group sync. Because of some overhead, it should be set
+at least 2 times the size of
+.Sy zfs_dirty_data_max
+.No to prevent harming normal write throughput.
+It also should be smaller than the size of the slog device if slog is present.
+.Pp
+Defaults to
+.Sy zfs_dirty_data_max*2
+.
 .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint
 Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
 preallocated for a file in order to guarantee that later writes will not

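To make the sizing guidance above concrete, here is a small user-space arithmetic sketch (hypothetical values, not part of this commit): with a 4 GiB zfs_dirty_data_max, the default leaves zfs_wrlog_data_max at 8 GiB, which is also the minimum the text recommends when setting the value by hand.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* assumed example value; the real default is sized in arc_init() */
	uint64_t zfs_dirty_data_max = 4ULL << 30;	/* 4 GiB */
	uint64_t zfs_wrlog_data_max = 0;		/* 0 = pick the default */

	if (zfs_wrlog_data_max == 0)
		zfs_wrlog_data_max = zfs_dirty_data_max * 2;

	printf("zfs_dirty_data_max = %llu bytes\n",
	    (unsigned long long)zfs_dirty_data_max);
	printf("zfs_wrlog_data_max = %llu bytes (2x the dirty limit)\n",
	    (unsigned long long)zfs_wrlog_data_max);
	return (0);
}
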
@@ -8062,6 +8062,18 @@ arc_init(void)
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
+
+	if (zfs_wrlog_data_max == 0) {
+
+		/*
+		 * dp_wrlog_total is reduced for each txg at the end of
+		 * spa_sync(). However, dp_dirty_total is reduced every time
+		 * a block is written out. Thus under normal operation,
+		 * dp_wrlog_total could grow 2 times as big as
+		 * zfs_dirty_data_max.
+		 */
+		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
+	}
 }
 
 void

@@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = {
 	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_delay",		KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_over_max",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_wrlog_over_max",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_dirty_frees_delay",	KSTAT_DATA_UINT64 },
 	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
 };
@@ -884,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 		return (SET_ERROR(ERESTART));
 	}
 
+	if (!tx->tx_dirty_delayed &&
+	    dsl_pool_wrlog_over_max(tx->tx_pool)) {
+		DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
+		return (SET_ERROR(ERESTART));
+	}
+
 	if (!tx->tx_dirty_delayed &&
 	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
 		tx->tx_wait_dirty = B_TRUE;

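For context, the new check surfaces to callers exactly like the existing dirty-data throttle: dmu_tx_try_assign() fails with ERESTART and the caller is expected to wait (dmu_tx_wait()) and retry. The kernel-context sketch below follows the retry pattern already used by OpenZFS callers of dmu_tx_assign(); example_throttled_write() and its arguments are hypothetical and it is not compilable on its own.

static int
example_throttled_write(objset_t *os, uint64_t object, uint64_t off, int len)
{
	dmu_tx_t *tx;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);

	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err != 0) {
		if (err == ERESTART) {
			/*
			 * Throttled: either dirty data or, with this commit,
			 * outstanding wrlog is over its limit.  Wait for the
			 * condition to clear, then retry the assignment.
			 */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (err);
	}

	/* ... perform the write under the assigned tx ... */
	dmu_tx_commit(tx);
	return (0);
}
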
@@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0;
 int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
 
+/*
+ * zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
+ * Once it is reached, write operation is blocked,
+ * until log data is cleared out after txg sync.
+ * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
+ */
+unsigned long zfs_wrlog_data_max = 0;
+
 /*
  * If there's at least this much dirty data (as a percentage of
  * zfs_dirty_data_max), push out a txg.  This should be less than
@@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
+	aggsum_init(&dp->dp_wrlog_total, 0);
+	for (int i = 0; i < TXG_SIZE; i++) {
+		aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
+	}
+
 	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
 	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
 	    TASKQ_THREADS_CPU_PCT);
@@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp)
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
+
+	ASSERT0(aggsum_value(&dp->dp_wrlog_total));
+	aggsum_fini(&dp->dp_wrlog_total);
+	for (int i = 0; i < TXG_SIZE; i++) {
+		ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
+		aggsum_fini(&dp->dp_wrlog_pertxg[i]);
+	}
+
 	taskq_destroy(dp->dp_unlinked_drain_taskq);
 	taskq_destroy(dp->dp_zrele_taskq);
 	if (dp->dp_blkstats != NULL)
@@ -590,6 +611,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
+void
+dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
+{
+	ASSERT3S(size, >=, 0);
+
+	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size);
+	aggsum_add(&dp->dp_wrlog_total, size);
+
+	/* Choose a value slightly bigger than min dirty sync bytes */
+	uint64_t sync_min =
+	    zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
+	if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
+		txg_kick(dp, txg);
+}
+
+boolean_t
+dsl_pool_wrlog_over_max(dsl_pool_t *dp)
+{
+	return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
+}
+
+static void
+dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
+{
+	int64_t delta;
+	delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
+	aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
+	aggsum_add(&dp->dp_wrlog_total, delta);
+}
+
 #ifdef ZFS_DEBUG
 static boolean_t
 dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
@@ -814,6 +865,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
+
+	dsl_pool_wrlog_clear(dp, txg);
+
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
@@ -1409,6 +1463,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
 	"Determines the dirty space limit");
 
+ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
+	"The size limit of write-transaction zil log data");
+
 /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
 ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
 	"zfs_dirty_data_max upper bound in bytes");

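To illustrate the accounting above, here is a self-contained user-space model (plain integers standing in for aggsum_t, hypothetical tunable values): log bytes accumulate per txg and in a pool-wide total, a txg sync is kicked once one txg's share exceeds sync_min, and the clear step at sync-done time removes that txg's contribution from both sums. Running it shows the second write in txg 100 crossing sync_min (about 1.2 GiB for these values) and the total returning to zero after the clear.

#include <stdio.h>
#include <stdint.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

static uint64_t wrlog_pertxg[TXG_SIZE];
static uint64_t wrlog_total;

/* assumed tunables: 4 GiB dirty max, 20% dirty sync percent */
static const uint64_t zfs_dirty_data_max = 4ULL << 30;
static const uint64_t zfs_dirty_data_sync_percent = 20;

static void
wrlog_count(uint64_t txg, uint64_t size)
{
	wrlog_pertxg[txg & TXG_MASK] += size;
	wrlog_total += size;

	/* slightly bigger than the min dirty sync bytes, as in the patch */
	uint64_t sync_min =
	    zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
	if (wrlog_pertxg[txg & TXG_MASK] > sync_min)
		printf("txg %llu kicked (%llu > %llu)\n",
		    (unsigned long long)txg,
		    (unsigned long long)wrlog_pertxg[txg & TXG_MASK],
		    (unsigned long long)sync_min);
}

static void
wrlog_clear(uint64_t txg)
{
	wrlog_total -= wrlog_pertxg[txg & TXG_MASK];
	wrlog_pertxg[txg & TXG_MASK] = 0;
}

int
main(void)
{
	wrlog_count(100, 1ULL << 30);	/* 1 GiB logged in txg 100 */
	wrlog_count(100, 1ULL << 29);	/* +512 MiB: exceeds sync_min */
	wrlog_clear(100);		/* sync done: both sums drop */
	printf("total after clear: %llu\n", (unsigned long long)wrlog_total);
	return (0);
}
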
@@ -538,6 +538,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	itx_wr_state_t write_state;
 	uintptr_t fsync_cnt;
 	uint64_t gen = 0;
+	ssize_t size = resid;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked ||
 	    zfs_xattr_owner_unlinked(zp)) {
@@ -623,6 +624,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		off += len;
 		resid -= len;
 	}
+
+	if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
+		dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg);
+	}
 }
 
 /*

@@ -84,10 +84,8 @@
 #include <sys/zfs_rlock.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
-
 #include <sys/zvol_impl.h>
 
-
 unsigned int zvol_inhibit_dev = 0;
 unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
 
@@ -577,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
 	itx_wr_state_t write_state;
+	uint64_t sz = size;
 
 	if (zil_replaying(zilog, tx))
 		return;
@@ -628,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 		offset += len;
 		size -= len;
 	}
+
+	if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
+		dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
+	}
 }
 
 /*