Refactor Log Size Limit

The original Log Size Limit implementation blocked all writes once the
limit was reached, until the TXG was committed and the log was freed.
This caused huge delays followed by speed spikes in application writes.

This implementation instead throttles writes smoothly, using exactly
the same mechanism that is used for dirty data.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: jxdking <lostking2008@hotmail.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Issue #12284
Closes #13476
This commit is contained in:
Alexander Motin
2022-05-24 12:46:35 -04:00
committed by Tony Hutter
parent 91e02156dd
commit 33223cbc3c
5 changed files with 54 additions and 33 deletions
+38 -22
View File
@@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = {
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
};
@@ -781,34 +781,49 @@ static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
dsl_pool_t *dp = tx->tx_pool;
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
hrtime_t wakeup, min_tx_time, now;
uint64_t delay_min_bytes, wrlog;
hrtime_t wakeup, tx_time = 0, now;
if (dirty <= delay_min_bytes)
/* Calculate minimum transaction time for the dirty data amount. */
delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
if (dirty > delay_min_bytes) {
/*
* The caller has already waited until we are under the max.
* We make them pass us the amount of dirty data so we don't
* have to handle the case of it being >= the max, which
* could cause a divide-by-zero if it's == the max.
*/
ASSERT3U(dirty, <, zfs_dirty_data_max);
tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
(zfs_dirty_data_max - dirty);
}
/* Calculate minimum transaction time for the TX_WRITE log size. */
wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
delay_min_bytes =
zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
if (wrlog >= zfs_wrlog_data_max) {
tx_time = zfs_delay_max_ns;
} else if (wrlog > delay_min_bytes) {
tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
(zfs_wrlog_data_max - wrlog), tx_time);
}
if (tx_time == 0)
return;
/*
* The caller has already waited until we are under the max.
* We make them pass us the amount of dirty data so we don't
* have to handle the case of it being >= the max, which could
* cause a divide-by-zero if it's == the max.
*/
ASSERT3U(dirty, <, zfs_dirty_data_max);
tx_time = MIN(tx_time, zfs_delay_max_ns);
now = gethrtime();
min_tx_time = zfs_delay_scale *
(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
if (now > tx->tx_start + min_tx_time)
if (now > tx->tx_start + tx_time)
return;
DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
uint64_t, min_tx_time);
uint64_t, tx_time);
mutex_enter(&dp->dp_lock);
wakeup = MAX(tx->tx_start + min_tx_time,
dp->dp_last_wakeup + min_tx_time);
wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
dp->dp_last_wakeup = wakeup;
mutex_exit(&dp->dp_lock);
@@ -886,8 +901,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
}
if (!tx->tx_dirty_delayed &&
dsl_pool_wrlog_over_max(tx->tx_pool)) {
DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
dsl_pool_need_wrlog_delay(tx->tx_pool)) {
tx->tx_wait_dirty = B_TRUE;
DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
return (SET_ERROR(ERESTART));
}