ZIL: Detect single-threaded workloads
... by checking that the previous block is fully written and flushed.
This allows commit delays to be skipped, since with a single writer
there is nothing to aggregate anyway.

This removes the zil_min_commit_timeout parameter: single-threaded
workloads do not need it at all, while on very fast devices even some
multi-threaded workloads may be detected as single-threaded and still
bypass the wait. To give multi-threaded workloads more aggregation
chances, zfs_commit_timeout_pct is increased from 5% to 10%, as they
should suffer less from the additional latency.

Single-threaded workload detection should also enable, in perspective,
better prediction of the next block size.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15381
commit 7ea8331009 (parent 3c5f354a8c)
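For orientation before the diff: the heuristic can be modeled in a few
lines of standalone C. This is only a sketch of the idea; the names
classify_commit and burst_done are invented for illustration, and the
real logic lives in zil_process_commit_list() and zil_burst_done()
below, keyed off lwb states and the new zl_parallel counter.

/* zl_parallel_sketch.c -- toy model of the heuristic; not OpenZFS code. */
#include <stdio.h>

#define	ZIL_BURSTS	8

static unsigned int zl_parallel = 0;	/* >0: treat workload as parallel */

/*
 * Classify one commit by the two states the patch inspects: is the
 * current log block (lwb) still opened by another thread, and has the
 * previous lwb finished writing and flushing?
 */
static void
classify_commit(int lwb_opened, int prev_flushed)
{
	if (lwb_opened) {
		/* Another thread left the block open: clearly parallel. */
		zl_parallel = ZIL_BURSTS;
	} else if (!prev_flushed) {
		/*
		 * Previous block still in flight: parallel, but commits
		 * are too far apart to aggregate well.
		 */
		if (zl_parallel < ZIL_BURSTS / 2)
			zl_parallel = ZIL_BURSTS / 2;
	}
	/*
	 * Otherwise the previous block is fully written and flushed:
	 * nothing suggests concurrency, so zl_parallel keeps decaying.
	 */
}

/* Decay the hint once per completed burst, as zil_burst_done() does. */
static void
burst_done(void)
{
	if (zl_parallel > 0)
		zl_parallel--;
}

int
main(void)
{
	/*
	 * A single-threaded writer: the flag decays to zero, after which
	 * the aggregation delay (the zfs_commit_timeout_pct wait) is
	 * bypassed entirely.
	 */
	for (int i = 0; i < 4; i++) {
		classify_commit(0, 1);
		burst_done();
		printf("commit %d: delay for aggregation? %s\n",
		    i, zl_parallel ? "yes" : "no");
	}
	return (0);
}

The design point worth noting: zl_parallel acts as a hysteresis
counter, so a single burst that looks parallel keeps aggregation
enabled for up to ZIL_BURSTS subsequent bursts before the delay is
bypassed again.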
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
@@ -181,6 +181,7 @@ typedef struct zil_vdev_node {
 	avl_node_t	zv_node;	/* AVL tree linkage */
 } zil_vdev_node_t;
 
+#define	ZIL_BURSTS 8
 #define	ZIL_PREV_BLKS 16
 
 /*
@@ -222,8 +223,9 @@ struct zilog {
 	clock_t		zl_replay_time;	/* lbolt of when replay started */
 	uint64_t	zl_replay_blks;	/* number of log blocks replayed */
 	zil_header_t	zl_old_header;	/* debugging aid */
-	uint_t		zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+	uint_t		zl_parallel;	/* workload is multi-threaded */
 	uint_t		zl_prev_rotor;	/* rotor for zl_prev[] */
+	uint_t		zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
 	txg_node_t	zl_dirty_link;	/* protected by dp_dirty_zilogs list */
 	uint64_t	zl_dirty_max_txg; /* highest txg used to dirty zilog */
 
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
@@ -799,7 +799,7 @@ Note that this should not be set below the ZED thresholds
 (currently 10 checksums over 10 seconds)
 or else the daemon may not trigger any action.
 .
-.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
+.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
 This controls the amount of time that a ZIL block (lwb) will remain "open"
 when it isn't "full", and it has a thread waiting for it to be committed to
 stable storage.
@@ -2206,13 +2206,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
 It tunes a tradeoff between additional memory copy and possibly worse log
 space efficiency vs additional range lock/unlock.
 .
-.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
-This sets the minimum delay in nanoseconds ZIL care to delay block commit,
-waiting for more records.
-If ZIL writes are too fast, kernel may not be able sleep for so short interval,
-increasing log latency above allowed by
-.Sy zfs_commit_timeout_pct .
-.
 .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable the cache flush commands that are normally sent to disk by
 the ZIL after an LWB write has completed.
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
@@ -91,15 +91,7 @@
  * committed to stable storage. Please refer to the zil_commit_waiter()
  * function (and the comments within it) for more details.
  */
-static uint_t zfs_commit_timeout_pct = 5;
+static uint_t zfs_commit_timeout_pct = 10;
 
-/*
- * Minimal time we care to delay commit waiting for more ZIL records.
- * At least FreeBSD kernel can't sleep for less than 2us at its best.
- * So requests to sleep for less then 5us is a waste of CPU time with
- * a risk of significant log latency increase due to oversleep.
- */
-static uint64_t zil_min_commit_timeout = 5000;
-
 /*
  * See zil.h for more information about these fields.
@@ -2732,6 +2724,19 @@ zil_commit_writer_stall(zilog_t *zilog)
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 }
 
+static void
+zil_burst_done(zilog_t *zilog)
+{
+	if (!list_is_empty(&zilog->zl_itx_commit_list) ||
+	    zilog->zl_cur_used == 0)
+		return;
+
+	if (zilog->zl_parallel)
+		zilog->zl_parallel--;
+
+	zilog->zl_cur_used = 0;
+}
+
 /*
  * This function will traverse the commit list, creating new lwbs as
  * needed, and committing the itxs from the commit list to these newly
@@ -2746,7 +2751,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 	list_t nolwb_waiters;
 	lwb_t *lwb, *plwb;
 	itx_t *itx;
-	boolean_t first = B_TRUE;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
@@ -2772,9 +2776,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 			zil_commit_activate_saxattr_feature(zilog);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
-		first = (lwb->lwb_state == LWB_STATE_NEW) &&
-		    ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
-		    plwb->lwb_state == LWB_STATE_FLUSH_DONE);
+
+		/*
+		 * If the lwb is still opened, it means the workload is really
+		 * multi-threaded and we won the chance of write aggregation.
+		 * If it is not opened yet, but previous lwb is still not
+		 * flushed, it still means the workload is multi-threaded, but
+		 * there was too much time between the commits to aggregate, so
+		 * we try aggregation next times, but without too much hopes.
+		 */
+		if (lwb->lwb_state == LWB_STATE_OPENED) {
+			zilog->zl_parallel = ZIL_BURSTS;
+		} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
+		    != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
+			zilog->zl_parallel = MAX(zilog->zl_parallel,
+			    ZIL_BURSTS / 2);
+		}
 	}
 
 	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
@@ -2849,7 +2866,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 				 * Our lwb is done, leave the rest of
 				 * itx list to somebody else who care.
 				 */
-				first = B_FALSE;
+				zilog->zl_parallel = ZIL_BURSTS;
 				break;
 			}
 		} else {
@@ -2941,31 +2958,18 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 		 * try and pack as many itxs into as few lwbs as
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
-		 *
-		 * If we had no already running or open LWBs, it can be
-		 * the workload is single-threaded. And if the ZIL write
-		 * latency is very small or if the LWB is almost full, it
-		 * may be cheaper to bypass the delay.
 		 */
-		if (lwb->lwb_state == LWB_STATE_OPENED && first) {
-			hrtime_t sleep = zilog->zl_last_lwb_latency *
-			    zfs_commit_timeout_pct / 100;
-			if (sleep < zil_min_commit_timeout ||
-			    lwb->lwb_nmax - lwb->lwb_nused <
-			    lwb->lwb_nmax / 8) {
-				list_insert_tail(ilwbs, lwb);
-				lwb = zil_lwb_write_close(zilog, lwb,
-				    LWB_STATE_NEW);
-				zilog->zl_cur_used = 0;
-				if (lwb == NULL) {
-					while ((lwb = list_remove_head(ilwbs))
-					    != NULL)
-						zil_lwb_write_issue(zilog, lwb);
-					zil_commit_writer_stall(zilog);
-				}
-			}
+		if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
+			list_insert_tail(ilwbs, lwb);
+			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+			zil_burst_done(zilog);
+			if (lwb == NULL) {
+				while ((lwb = list_remove_head(ilwbs)) != NULL)
+					zil_lwb_write_issue(zilog, lwb);
+				zil_commit_writer_stall(zilog);
+			}
 		}
 	}
 
 	/*
@@ -3120,19 +3124,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
-	/*
-	 * Since the lwb's zio hadn't been issued by the time this thread
-	 * reached its timeout, we reset the zilog's "zl_cur_used" field
-	 * to influence the zil block size selection algorithm.
-	 *
-	 * By having to issue the lwb's zio here, it means the size of the
-	 * lwb was too large, given the incoming throughput of itxs. By
-	 * setting "zl_cur_used" to zero, we communicate this fact to the
-	 * block size selection algorithm, so it can take this information
-	 * into account, and potentially select a smaller size for the
-	 * next lwb block that is allocated.
-	 */
-	zilog->zl_cur_used = 0;
+	zil_burst_done(zilog);
 
 	if (nlwb == NULL) {
 		/*
@@ -4250,9 +4242,6 @@ EXPORT_SYMBOL(zil_kstat_values_update);
 ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
 	"ZIL block open timeout percentage");
 
-ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
-	"Minimum delay we care for ZIL block commit");
-
 ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
 	"Disable intent logging replay");
 
|
Loading…
Reference in New Issue
Block a user