From dfbe26750308d757d20af8eb5aefef49ec65d4a8 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Tue, 12 Dec 2017 15:46:58 -0800 Subject: [PATCH] OpenZFS 9617 - too-frequent TXG sync causes excessive write inflation Porting notes: * Renamed zfs_dirty_data_sync_pct to zfs_dirty_data_sync_percent and changed the type to be consistent with the other dirty module params. * Updated zfs-module-parameters.5 accordingly. Authored by: Matthew Ahrens Reviewed by: Serapheim Dimitropoulos Reviewed by: Brad Lewis Reviewed by: George Wilson Reviewed by: Andrew Stormont Reviewed-by: George Melikov Approved by: Robert Mustacchi Ported-by: Brian Behlendorf OpenZFS-issue: https://illumos.org/issues/9617 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7928f4ba Closes #7976 --- include/sys/dsl_pool.h | 2 +- man/man5/zfs-module-parameters.5 | 8 +++++--- module/zfs/dsl_pool.c | 12 ++++++++---- module/zfs/txg.c | 7 ++++--- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 01870e867..56317cf73 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -57,7 +57,7 @@ struct dsl_crypto_params; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; -extern unsigned long zfs_dirty_data_sync; +extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; extern int zfs_delay_min_dirty_percent; diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index b02d6aae7..d7b53d161 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -1225,12 +1225,14 @@ Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR. .sp .ne 2 .na -\fBzfs_dirty_data_sync\fR (int) +\fBzfs_dirty_data_sync_percent\fR (int) .ad .RS 12n -Start syncing out a transaction group if there is at least this much dirty data. +Start syncing out a transaction group if there's at least this much dirty data +as a percentage of \fBzfs_dirty_data_max\fR. This should be less than +\fBzfs_vdev_async_write_active_min_dirty_percent\fR. .sp -Default value: \fB67,108,864\fR. +Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR. .RE .sp diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index bd2055710..b940382d1 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -106,9 +106,11 @@ int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; /* - * If there is at least this much dirty data, push out a txg. + * If there's at least this much dirty data (as a percentage of + * zfs_dirty_data_max), push out a txg. This should be less than + * zfs_vdev_async_write_active_min_dirty_percent. */ -unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024; +int zfs_dirty_data_sync_percent = 20; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in @@ -879,10 +881,12 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; boolean_t rv; mutex_enter(&dp->dp_lock); - if (dp->dp_dirty_total > zfs_dirty_data_sync) + if (dp->dp_dirty_total > dirty_min_bytes) txg_kick(dp); rv = (dp->dp_dirty_total > delay_min_bytes); mutex_exit(&dp->dp_lock); @@ -1345,7 +1349,7 @@ module_param(zfs_dirty_data_max_max, ulong, 0444); MODULE_PARM_DESC(zfs_dirty_data_max_max, "zfs_dirty_data_max upper bound in bytes"); -module_param(zfs_dirty_data_sync, ulong, 0644); +module_param(zfs_dirty_data_sync_percent, int, 0644); MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data"); module_param(zfs_delay_scale, ulong, 0644); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index cfc1d0a35..aad1a8e57 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -517,7 +517,8 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - txg_stat_t *ts; + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -529,7 +530,7 @@ txg_sync_thread(void *arg) !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < zfs_dirty_data_sync) { + dp->dp_dirty_total < dirty_min_bytes) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); @@ -561,7 +562,7 @@ txg_sync_thread(void *arg) tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); - ts = spa_txg_history_init_io(spa, txg, dp); + txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp); cv_broadcast(&tx->tx_quiesce_more_cv); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",