ZIL: Detect single-threaded workloads

... by checking that the previous block is fully written and flushed.
This allows skipping commit delays, since we can give up on aggregation
in that case.  This removes the zil_min_commit_timeout parameter, since
for single-threaded workloads it is not needed at all, while on very
fast devices even some multi-threaded workloads may get detected as
single-threaded and still bypass the wait.  To give multi-threaded
workloads more chances for aggregation, increase zfs_commit_timeout_pct
from 5 to 10%, as they should suffer less from the additional latency.

Also, detection of single-threaded workloads should in the future allow
better prediction of the next block size.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15381
This commit is contained in:
Alexander Motin 2023-10-24 17:35:25 -04:00 committed by GitHub
parent e007908a16
commit 252f46be7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 44 additions and 60 deletions

View File

@ -181,6 +181,7 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */ avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t; } zil_vdev_node_t;
#define ZIL_BURSTS 8
#define ZIL_PREV_BLKS 16 #define ZIL_PREV_BLKS 16
/* /*
@ -222,8 +223,9 @@ struct zilog {
clock_t zl_replay_time; /* lbolt of when replay started */ clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */ uint64_t zl_replay_blks; /* number of log blocks replayed */
zil_header_t zl_old_header; /* debugging aid */ zil_header_t zl_old_header; /* debugging aid */
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ uint_t zl_parallel; /* workload is multi-threaded */
uint_t zl_prev_rotor; /* rotor for zl_prev[] */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */

View File

@ -798,7 +798,7 @@ Note that this should not be set below the ZED thresholds
(currently 10 checksums over 10 seconds) (currently 10 checksums over 10 seconds)
or else the daemon may not trigger any action. or else the daemon may not trigger any action.
. .
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint .It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
This controls the amount of time that a ZIL block (lwb) will remain "open" This controls the amount of time that a ZIL block (lwb) will remain "open"
when it isn't "full", and it has a thread waiting for it to be committed to when it isn't "full", and it has a thread waiting for it to be committed to
stable storage. stable storage.
@ -2155,13 +2155,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
It tunes a tradeoff between additional memory copy and possibly worse log It tunes a tradeoff between additional memory copy and possibly worse log
space efficiency vs additional range lock/unlock. space efficiency vs additional range lock/unlock.
. .
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
This sets the minimum delay in nanoseconds ZIL care to delay block commit,
waiting for more records.
If ZIL writes are too fast, kernel may not be able sleep for so short interval,
increasing log latency above allowed by
.Sy zfs_commit_timeout_pct .
.
.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
Disable the cache flush commands that are normally sent to disk by Disable the cache flush commands that are normally sent to disk by
the ZIL after an LWB write has completed. the ZIL after an LWB write has completed.

View File

@ -91,15 +91,7 @@
* committed to stable storage. Please refer to the zil_commit_waiter() * committed to stable storage. Please refer to the zil_commit_waiter()
* function (and the comments within it) for more details. * function (and the comments within it) for more details.
*/ */
static uint_t zfs_commit_timeout_pct = 5; static uint_t zfs_commit_timeout_pct = 10;
/*
* Minimal time we care to delay commit waiting for more ZIL records.
* At least FreeBSD kernel can't sleep for less than 2us at its best.
* So requests to sleep for less then 5us is a waste of CPU time with
* a risk of significant log latency increase due to oversleep.
*/
static uint64_t zil_min_commit_timeout = 5000;
/* /*
* See zil.h for more information about these fields. * See zil.h for more information about these fields.
@ -2696,6 +2688,19 @@ zil_commit_writer_stall(zilog_t *zilog)
ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT(list_is_empty(&zilog->zl_lwb_list));
} }
/*
 * Account for the end of a burst of ZIL writes.
 *
 * Nothing is done while itxs are still queued on zl_itx_commit_list, or
 * when zl_cur_used is already zero.  Otherwise the zl_parallel counter is
 * stepped down by one (it never goes below zero) and zl_cur_used is reset
 * to zero.
 */
static void
zil_burst_done(zilog_t *zilog)
{
	if (list_is_empty(&zilog->zl_itx_commit_list) &&
	    zilog->zl_cur_used != 0) {
		/* One more burst passed; decay the multi-threaded hint. */
		if (zilog->zl_parallel > 0)
			zilog->zl_parallel--;
		zilog->zl_cur_used = 0;
	}
}
/* /*
* This function will traverse the commit list, creating new lwbs as * This function will traverse the commit list, creating new lwbs as
* needed, and committing the itxs from the commit list to these newly * needed, and committing the itxs from the commit list to these newly
@ -2710,7 +2715,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
list_t nolwb_waiters; list_t nolwb_waiters;
lwb_t *lwb, *plwb; lwb_t *lwb, *plwb;
itx_t *itx; itx_t *itx;
boolean_t first = B_TRUE;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@ -2736,9 +2740,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
zil_commit_activate_saxattr_feature(zilog); zil_commit_activate_saxattr_feature(zilog);
ASSERT(lwb->lwb_state == LWB_STATE_NEW || ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
lwb->lwb_state == LWB_STATE_OPENED); lwb->lwb_state == LWB_STATE_OPENED);
first = (lwb->lwb_state == LWB_STATE_NEW) &&
((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || /*
plwb->lwb_state == LWB_STATE_FLUSH_DONE); * If the lwb is still opened, it means the workload is really
* multi-threaded and we won the chance of write aggregation.
* If it is not opened yet, but previous lwb is still not
* flushed, it still means the workload is multi-threaded, but
* there was too much time between the commits to aggregate, so
* we try aggregation the next few times, but without much hope.
*/
if (lwb->lwb_state == LWB_STATE_OPENED) {
zilog->zl_parallel = ZIL_BURSTS;
} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
!= NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
zilog->zl_parallel = MAX(zilog->zl_parallel,
ZIL_BURSTS / 2);
}
} }
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
@ -2813,7 +2830,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* Our lwb is done, leave the rest of * Our lwb is done, leave the rest of
* itx list to somebody else who care. * itx list to somebody else who care.
*/ */
first = B_FALSE; zilog->zl_parallel = ZIL_BURSTS;
break; break;
} }
} else { } else {
@ -2905,28 +2922,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* try and pack as many itxs into as few lwbs as * try and pack as many itxs into as few lwbs as
* possible, without significantly impacting the latency * possible, without significantly impacting the latency
* of each individual itx. * of each individual itx.
*
* If we had no already running or open LWBs, it can be
* the workload is single-threaded. And if the ZIL write
* latency is very small or if the LWB is almost full, it
* may be cheaper to bypass the delay.
*/ */
if (lwb->lwb_state == LWB_STATE_OPENED && first) { if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
hrtime_t sleep = zilog->zl_last_lwb_latency * list_insert_tail(ilwbs, lwb);
zfs_commit_timeout_pct / 100; lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
if (sleep < zil_min_commit_timeout || zil_burst_done(zilog);
lwb->lwb_nmax - lwb->lwb_nused < if (lwb == NULL) {
lwb->lwb_nmax / 8) { while ((lwb = list_remove_head(ilwbs)) != NULL)
list_insert_tail(ilwbs, lwb); zil_lwb_write_issue(zilog, lwb);
lwb = zil_lwb_write_close(zilog, lwb, zil_commit_writer_stall(zilog);
LWB_STATE_NEW);
zilog->zl_cur_used = 0;
if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs))
!= NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
}
} }
} }
} }
@ -3084,19 +3088,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
/* zil_burst_done(zilog);
* Since the lwb's zio hadn't been issued by the time this thread
* reached its timeout, we reset the zilog's "zl_cur_used" field
* to influence the zil block size selection algorithm.
*
* By having to issue the lwb's zio here, it means the size of the
* lwb was too large, given the incoming throughput of itxs. By
* setting "zl_cur_used" to zero, we communicate this fact to the
* block size selection algorithm, so it can take this information
* into account, and potentially select a smaller size for the
* next lwb block that is allocated.
*/
zilog->zl_cur_used = 0;
if (nlwb == NULL) { if (nlwb == NULL) {
/* /*
@ -4214,9 +4206,6 @@ EXPORT_SYMBOL(zil_kstat_values_update);
ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
"ZIL block open timeout percentage"); "ZIL block open timeout percentage");
ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
"Minimum delay we care for ZIL block commit");
ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
"Disable intent logging replay"); "Disable intent logging replay");