OpenZFS 9962 - zil_commit should omit cache thrash
As a result of the changes made in 8585, it's possible for an excessive
number of vdev flush commands to be issued under some workloads.
Specifically, when the workload consists of mostly async write activity,
interspersed with some sync write and/or fsync activity, we can end up
issuing more flush commands to the underlying storage than is actually
necessary. As a result of these flush commands, write latency increases
and the overall throughput of the pool decreases.

Currently, any time an lwb completes, the vdev(s) written to as a result
of that lwb will be issued a flush command. The intention is to ensure
the data written to that vdev is on stable storage prior to
communicating to any waiting threads that their data is safe on disk.

The problem with this scheme is that sometimes an lwb will not have any
threads waiting for it to complete. This can occur when there's async
activity that gets "converted" to sync requests, as a result of calling
the zil_async_to_sync() function via zil_commit_impl(). When this
occurs, the current code may issue many lwbs that don't have waiters
associated with them, resulting in many flush commands, potentially to
the same vdev(s).

For example, given a pool with a single vdev, and a single fsync() call
that results in 10 lwbs being written out (e.g. due to other async
writes), that will result in 10 flush commands to that single vdev (a
flush issued after each lwb write completes). Ideally, we'd only issue
a single flush command to that vdev, after all 10 lwb writes completed.

Further, and most important as it pertains to this change, since the
flush commands are often very impactful to the performance of the
pool's underlying storage, unnecessarily issuing these flush commands
can poorly impact the performance of the lwb writes themselves. Thus,
we need to avoid issuing flush commands when possible, in order to
achieve the best possible performance out of the pool's underlying
storage.

This change attempts to address this problem by changing the ZIL's
logic to only issue a vdev flush command when it detects an lwb that
has a thread waiting for it to complete. When an lwb does not have
threads waiting for it, the responsibility of issuing the flush command
to the vdevs involved with that lwb's write is passed on to the "next"
lwb. It's only once a write for an lwb with waiters completes that we
issue the vdev flush command(s). As a result, now when we issue the
flush(es), we will issue them to the vdevs involved with that specific
lwb's write, but potentially also to vdevs involved with "previous" lwb
writes (i.e. if the previous lwbs did not have waiters associated with
them).

Thus, in our prior example with 10 lwbs, it's only once the last lwb
completes (which will be the lwb containing the waiter for the thread
that called fsync) that we issue the vdev flush command; all of the
other lwbs will find they have no waiters, so they'll pass the
responsibility of the flush to the "next" lwb (until reaching the last
lwb that has the waiter).

Porting Notes:
* Reconciled conflicts with the fastwrite feature.

Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
Ported-by: Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/9962
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/545190c6
Closes #8188
parent 53b1f5eac6
commit 900d09b285
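The flush-deferral decision described above is easiest to see in miniature. The sketch below is a simplified rendering of the logic this change adds to zil_lwb_write_done() in module/zfs/zil.c (see the diff that follows); locking, error handling, and fastwrite bookkeeping are omitted, and lwb_write_done_sketch() and flush_lwb_vdevs() are illustrative names, not part of the patch:

static void
lwb_write_done_sketch(zilog_t *zilog, lwb_t *lwb)
{
    /* The next-younger lwb in the zilog's list, if any. */
    lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);

    /*
     * No thread is waiting on this lwb and a younger lwb exists:
     * merge this lwb's recorded vdevs (lwb_vdev_tree) into the next
     * lwb instead of flushing. The flush responsibility keeps moving
     * forward until an lwb that has waiters completes; that lwb then
     * issues one flush covering its own vdevs plus everything that
     * was deferred to it.
     */
    if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
        zil_lwb_flush_defer(lwb, nlwb);
        return;
    }

    /* An lwb with waiters: flush every vdev recorded in lwb_vdev_tree. */
    flush_lwb_vdevs(lwb);
}

In the 10-lwb fsync example from the commit message, the first nine lwbs take the defer path, so the single flush issued by the tenth covers all ten writes.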
cmd/ztest/ztest.c

@@ -2157,6 +2157,7 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
  * ZIL get_data callbacks
  */
 
+/* ARGSUSED */
 static void
 ztest_get_done(zgd_t *zgd, int error)
 {
@@ -2169,9 +2170,6 @@ ztest_get_done(zgd_t *zgd, int error)
     ztest_range_unlock((rl_t *)zgd->zgd_lr);
     ztest_object_unlock(zd, object);
 
-    if (error == 0 && zgd->zgd_bp)
-        zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-
     umem_free(zgd, sizeof (*zgd));
 }
 
include/sys/zil_impl.h

@@ -47,10 +47,11 @@ extern "C" {
  * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
  * be held when making this transition.
  *
- * After the lwb's zio completes, and the vdev's are flushed, the lwb
- * will transition into the "done" state via zil_lwb_write_done(). When
- * transitioning from "issued" to "done", the zilog's "zl_lock" must be
- * held, *not* the "zl_issuer_lock".
+ * After the lwb's write zio completes, it transitions into the "write
+ * done" state via zil_lwb_write_done(); and then into the "flush done"
+ * state via zil_lwb_flush_vdevs_done(). When transitioning from
+ * "issued" to "write done", and then from "write done" to "flush done",
+ * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock".
  *
  * The zilog's "zl_issuer_lock" can become heavily contended in certain
  * workloads, so we specifically avoid acquiring that lock when
@@ -67,13 +68,14 @@ extern "C" {
  * "zl_issuer_lock" will prevent a concurrent thread from transitioning
  * that lwb to the "issued" state. Likewise, if an lwb is already in the
  * "issued" state, holding the "zl_lock" will prevent a concurrent
- * thread from transitioning that lwb to the "done" state.
+ * thread from transitioning that lwb to the "write done" state.
  */
 typedef enum {
     LWB_STATE_CLOSED,
     LWB_STATE_OPENED,
     LWB_STATE_ISSUED,
-    LWB_STATE_DONE,
+    LWB_STATE_WRITE_DONE,
+    LWB_STATE_FLUSH_DONE,
     LWB_NUM_STATES
 } lwb_state_t;
 
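For orientation, the lwb lifecycle described by the reworked header comments can be summarized as follows (a reader's summary drawn from the comments above, not part of the patch):

/*
 * lwb states and transition points after this change:
 *
 *   CLOSED -> OPENED          zil_lwb_write_open()          zl_issuer_lock
 *   OPENED -> ISSUED          zil_lwb_write_issue()         zl_issuer_lock
 *   ISSUED -> WRITE_DONE      zil_lwb_write_done()          zl_lock
 *   WRITE_DONE -> FLUSH_DONE  zil_lwb_flush_vdevs_done()    zl_lock
 */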
module/zfs/dmu.c

@@ -1757,6 +1757,15 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
     dmu_sync_arg_t *dsa = varg;
     dbuf_dirty_record_t *dr = dsa->dsa_dr;
     dmu_buf_impl_t *db = dr->dr_dbuf;
+    zgd_t *zgd = dsa->dsa_zgd;
+
+    /*
+     * Record the vdev(s) backing this blkptr so they can be flushed after
+     * the writes for the lwb have completed.
+     */
+    if (zio->io_error == 0) {
+        zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+    }
 
     mutex_enter(&db->db_mtx);
     ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
@@ -1806,14 +1815,23 @@ dmu_sync_late_arrival_done(zio_t *zio)
 {
     blkptr_t *bp = zio->io_bp;
     dmu_sync_arg_t *dsa = zio->io_private;
-    ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
+    zgd_t *zgd = dsa->dsa_zgd;
 
-    if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
-        ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
-        ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-        ASSERT(zio->io_bp->blk_birth == zio->io_txg);
-        ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
-        zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+    if (zio->io_error == 0) {
+        /*
+         * Record the vdev(s) backing this blkptr so they can be
+         * flushed after the writes for the lwb have completed.
+         */
+        zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+
+        if (!BP_IS_HOLE(bp)) {
+            ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
+            ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+            ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+            ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+            ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+            zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+        }
     }
 
     dmu_tx_commit(dsa->dsa_tx);
module/zfs/zfs_vnops.c

@@ -976,6 +976,7 @@ zfs_iput_async(struct inode *ip)
     iput(ip);
 }
 
+/* ARGSUSED */
 void
 zfs_get_done(zgd_t *zgd, int error)
 {
@@ -992,9 +993,6 @@ zfs_get_done(zgd_t *zgd, int error)
      */
     zfs_iput_async(ZTOI(zp));
 
-    if (error == 0 && zgd->zgd_bp)
-        zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-
     kmem_free(zgd, sizeof (zgd_t));
 }
 
@@ -1118,11 +1116,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
             * TX_WRITE2 relies on the data previously
             * written by the TX_WRITE that caused
             * EALREADY. We zero out the BP because
-            * it is the old, currently-on-disk BP,
-            * so there's no need to zio_flush() its
-            * vdevs (flushing would needlesly hurt
-            * performance, and doesn't work on
-            * indirect vdevs).
+            * it is the old, currently-on-disk BP.
             */
            zgd->zgd_bp = NULL;
            BP_ZERO(bp);
module/zfs/zil.c (218 changed lines)
@@ -588,7 +588,7 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
     ASSERT3P(lwb->lwb_root_zio, ==, NULL);
     ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
     ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
-        lwb->lwb_state == LWB_STATE_DONE);
+        lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 
     /*
      * Clear the zilog's field to indicate this lwb is no longer
@@ -1011,7 +1011,8 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
     ASSERT3P(zcw->zcw_lwb, ==, NULL);
     ASSERT3P(lwb, !=, NULL);
     ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
-        lwb->lwb_state == LWB_STATE_ISSUED);
+        lwb->lwb_state == LWB_STATE_ISSUED ||
+        lwb->lwb_state == LWB_STATE_WRITE_DONE);
 
     list_insert_tail(&lwb->lwb_waiters, zcw);
     zcw->zcw_lwb = lwb;
@@ -1057,6 +1058,42 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
     mutex_exit(&lwb->lwb_vdev_lock);
 }
 
+static void
+zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
+{
+    avl_tree_t *src = &lwb->lwb_vdev_tree;
+    avl_tree_t *dst = &nlwb->lwb_vdev_tree;
+    void *cookie = NULL;
+    zil_vdev_node_t *zv;
+
+    ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+    ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+    ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+    /*
+     * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
+     * not need the protection of lwb_vdev_lock (it will only be modified
+     * while holding zilog->zl_lock) as its writes and those of its
+     * children have all completed. The younger 'nlwb' may be waiting on
+     * future writes to additional vdevs.
+     */
+    mutex_enter(&nlwb->lwb_vdev_lock);
+    /*
+     * Tear down the 'lwb' vdev tree, ensuring that entries which do not
+     * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
+     */
+    while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
+        avl_index_t where;
+
+        if (avl_find(dst, zv, &where) == NULL) {
+            avl_insert(dst, zv, where);
+        } else {
+            kmem_free(zv, sizeof (*zv));
+        }
+    }
+    mutex_exit(&nlwb->lwb_vdev_lock);
+}
+
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
@@ -1064,9 +1101,13 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 }
 
 /*
- * This function is a called after all VDEVs associated with a given lwb
+ * This function is a called after all vdevs associated with a given lwb
  * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set.
+ * as the lwb write completes, if "zil_nocacheflush" is set. Further,
+ * all "previous" lwb's will have completed before this function is
+ * called; i.e. this function is called for all previous lwbs before
+ * it's called for "this" lwb (enforced via zio the dependencies
+ * configured in zil_lwb_set_zio_dependency()).
  *
  * The intention is for this function to be called as soon as the
  * contents of an lwb are considered "stable" on disk, and will survive
@@ -1104,7 +1145,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
     zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
 
     lwb->lwb_root_zio = NULL;
-    lwb->lwb_state = LWB_STATE_DONE;
+
+    ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+    lwb->lwb_state = LWB_STATE_FLUSH_DONE;
 
     if (zilog->zl_last_lwb_opened == lwb) {
         /*
@@ -1150,14 +1193,17 @@
 }
 
 /*
- * This is called when an lwb write completes. This means, this specific
- * lwb was written to disk, and all dependent lwb have also been
- * written to disk.
- *
- * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
- * the VDEVs involved in writing out this specific lwb. The lwb will be
- * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
- * zio completion callback for the lwb's root zio.
+ * This is called when an lwb's write zio completes. The callback's
+ * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
+ * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
+ * in writing out this specific lwb's data, and in the case that cache
+ * flushes have been deferred, vdevs involved in writing the data for
+ * previous lwbs. The writes corresponding to all the vdevs in the
+ * lwb_vdev_tree will have completed by the time this is called, due to
+ * the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done"
+ * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
+ * completion callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
@@ -1168,6 +1214,7 @@ zil_lwb_write_done(zio_t *zio)
     avl_tree_t *t = &lwb->lwb_vdev_tree;
     void *cookie = NULL;
     zil_vdev_node_t *zv;
+    lwb_t *nlwb;
 
     ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
@@ -1181,11 +1228,12 @@ zil_lwb_write_done(zio_t *zio)
 
     abd_put(zio->io_abd);
 
-    ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
-
     mutex_enter(&zilog->zl_lock);
+    ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
+    lwb->lwb_state = LWB_STATE_WRITE_DONE;
     lwb->lwb_write_zio = NULL;
     lwb->lwb_fastwrite = FALSE;
+    nlwb = list_next(&zilog->zl_lwb_list, lwb);
     mutex_exit(&zilog->zl_lock);
 
     if (avl_numnodes(t) == 0)
@@ -1204,6 +1252,27 @@ zil_lwb_write_done(zio_t *zio)
         return;
     }
 
+    /*
+     * If this lwb does not have any threads waiting for it to
+     * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
+     * command to the vdevs written to by "this" lwb, and instead
+     * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
+     * command for those vdevs. Thus, we merge the vdev tree of
+     * "this" lwb with the vdev tree of the "next" lwb in the list,
+     * and assume the "next" lwb will handle flushing the vdevs (or
+     * deferring the flush(s) again).
+     *
+     * This is a useful performance optimization, especially for
+     * workloads with lots of async write activity and few sync
+     * write and/or fsync activity, as it has the potential to
+     * coalesce multiple flush commands to a vdev into one.
+     */
+    if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+        zil_lwb_flush_defer(lwb, nlwb);
+        ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+        return;
+    }
+
     while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
         vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
         if (vd != NULL)
@@ -1212,6 +1281,73 @@ zil_lwb_write_done(zio_t *zio)
     }
 }
 
+static void
+zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
+{
+    lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
+
+    ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+    ASSERT(MUTEX_HELD(&zilog->zl_lock));
+
+    /*
+     * The zilog's "zl_last_lwb_opened" field is used to build the
+     * lwb/zio dependency chain, which is used to preserve the
+     * ordering of lwb completions that is required by the semantics
+     * of the ZIL. Each new lwb zio becomes a parent of the
+     * "previous" lwb zio, such that the new lwb's zio cannot
+     * complete until the "previous" lwb's zio completes.
+     *
+     * This is required by the semantics of zil_commit(); the commit
+     * waiters attached to the lwbs will be woken in the lwb zio's
+     * completion callback, so this zio dependency graph ensures the
+     * waiters are woken in the correct order (the same order the
+     * lwbs were created).
+     */
+    if (last_lwb_opened != NULL &&
+        last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
+        ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+            last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
+            last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
+
+        ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
+        zio_add_child(lwb->lwb_root_zio,
+            last_lwb_opened->lwb_root_zio);
+
+        /*
+         * If the previous lwb's write hasn't already completed,
+         * we also want to order the completion of the lwb write
+         * zios (above, we only order the completion of the lwb
+         * root zios). This is required because of how we can
+         * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+         *
+         * When the DKIOCFLUSHWRITECACHE commands are defered,
+         * the previous lwb will rely on this lwb to flush the
+         * vdevs written to by that previous lwb. Thus, we need
+         * to ensure this lwb doesn't issue the flush until
+         * after the previous lwb's write completes. We ensure
+         * this ordering by setting the zio parent/child
+         * relationship here.
+         *
+         * Without this relationship on the lwb's write zio,
+         * it's possible for this lwb's write to complete prior
+         * to the previous lwb's write completing; and thus, the
+         * vdevs for the previous lwb would be flushed prior to
+         * that lwb's data being written to those vdevs (the
+         * vdevs are flushed in the lwb write zio's completion
+         * handler, zil_lwb_write_done()).
+         */
+        if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
+            ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+                last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
+
+            ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
+            zio_add_child(lwb->lwb_write_zio,
+                last_lwb_opened->lwb_write_zio);
+        }
+    }
+}
+
+
 /*
  * This function's purpose is to "open" an lwb such that it is ready to
  * accept new itxs being committed to it. To do this, the lwb's zio
@@ -1263,30 +1399,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 
         lwb->lwb_state = LWB_STATE_OPENED;
 
-        /*
-         * The zilog's "zl_last_lwb_opened" field is used to
-         * build the lwb/zio dependency chain, which is used to
-         * preserve the ordering of lwb completions that is
-         * required by the semantics of the ZIL. Each new lwb
-         * zio becomes a parent of the "previous" lwb zio, such
-         * that the new lwb's zio cannot complete until the
-         * "previous" lwb's zio completes.
-         *
-         * This is required by the semantics of zil_commit();
-         * the commit waiters attached to the lwbs will be woken
-         * in the lwb zio's completion callback, so this zio
-         * dependency graph ensures the waiters are woken in the
-         * correct order (the same order the lwbs were created).
-         */
-        lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
-        if (last_lwb_opened != NULL &&
-            last_lwb_opened->lwb_state != LWB_STATE_DONE) {
-            ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
-                last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
-            ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
-            zio_add_child(lwb->lwb_root_zio,
-                last_lwb_opened->lwb_root_zio);
-        }
+        zil_lwb_set_zio_dependency(zilog, lwb);
         zilog->zl_last_lwb_opened = lwb;
     }
     mutex_exit(&zilog->zl_lock);
@@ -2012,7 +2125,8 @@ zil_prune_commit_list(zilog_t *zilog)
         mutex_enter(&zilog->zl_lock);
 
         lwb_t *last_lwb = zilog->zl_last_lwb_opened;
-        if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
+        if (last_lwb == NULL ||
+            last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
             /*
              * All of the itxs this waiter was waiting on
              * must have already completed (or there were
@@ -2095,7 +2209,8 @@ zil_process_commit_list(zilog_t *zilog)
         lwb = zil_create(zilog);
     } else {
         ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-        ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+        ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+        ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
     }
 
     while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
@@ -2217,7 +2332,8 @@ zil_process_commit_list(zilog_t *zilog)
         ASSERT(list_is_empty(&nolwb_waiters));
         ASSERT3P(lwb, !=, NULL);
         ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-        ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+        ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+        ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
         /*
          * At this point, the ZIL block pointed at by the "lwb"
@@ -2340,7 +2456,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
      * acquiring it when it's not necessary to do so.
      */
     if (lwb->lwb_state == LWB_STATE_ISSUED ||
-        lwb->lwb_state == LWB_STATE_DONE)
+        lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+        lwb->lwb_state == LWB_STATE_FLUSH_DONE)
         return;
 
     /*
@@ -2388,7 +2505,8 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
      * more details on the lwb states, and locking requirements.
      */
     if (lwb->lwb_state == LWB_STATE_ISSUED ||
-        lwb->lwb_state == LWB_STATE_DONE)
+        lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+        lwb->lwb_state == LWB_STATE_FLUSH_DONE)
         goto out;
 
     ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
@@ -2561,7 +2679,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 
             IMPLY(lwb != NULL,
                 lwb->lwb_state == LWB_STATE_ISSUED ||
-                lwb->lwb_state == LWB_STATE_DONE);
+                lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+                lwb->lwb_state == LWB_STATE_FLUSH_DONE);
             cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
         }
     }
@@ -3256,15 +3375,14 @@ zil_suspend(const char *osname, void **cookiep)
      * to disk before proceeding. If we used zil_commit instead, it
      * would just call txg_wait_synced(), because zl_suspend is set.
      * txg_wait_synced() doesn't wait for these lwb's to be
-     * LWB_STATE_DONE before returning.
+     * LWB_STATE_FLUSH_DONE before returning.
      */
     zil_commit_impl(zilog, 0);
 
     /*
-     * Now that we've ensured all lwb's are LWB_STATE_DONE,
-     * txg_wait_synced() will be called from within zil_destroy(),
-     * which will ensure the data from the zilog has migrated to the
-     * main pool before it returns.
+     * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
+     * use txg_wait_synced() to ensure the data from the zilog has
+     * migrated to the main pool before calling zil_destroy().
     */
    txg_wait_synced(zilog->zl_dmu_pool, 0);
 

module/zfs/zvol.c

@@ -1030,6 +1030,7 @@ out:
 #endif
 }
 
+/* ARGSUSED */
 static void
 zvol_get_done(zgd_t *zgd, int error)
 {
@@ -1038,9 +1039,6 @@ zvol_get_done(zgd_t *zgd, int error)
 
     rangelock_exit(zgd->zgd_lr);
 
-    if (error == 0 && zgd->zgd_bp)
-        zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
-
     kmem_free(zgd, sizeof (zgd_t));
 }
 