ZIL: Reduce scope of per-dataset zl_issuer_lock.

Before this change, the ZIL copied all log data while holding the lock.
This caused heavy lock contention on workloads with many large parallel
writes.  This change splits the process into two parts: first, while
holding the lock, zil_lwb_assign() estimates the log space needed for
all transactions and zil_lwb_write_close() allocates blocks and zios;
then, after the lock is dropped, zil_lwb_commit() copies the data and
zil_lwb_write_issue() issues the I/Os.

Also, while here, slightly reduce the scope of zl_lock.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Signed-off-by:  Alexander Motin <mav@FreeBSD.org>
Sponsored by:   iXsystems, Inc.
Closes #14841
This commit is contained in:
Alexander Motin 2023-05-25 12:48:43 -04:00 committed by GitHub
parent 79b20949b2
commit f63811f072
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 287 additions and 152 deletions

View File

@ -44,7 +44,7 @@ extern "C" {
* must be held. * must be held.
* *
* After the lwb is "opened", it can transition into the "issued" state * After the lwb is "opened", it can transition into the "issued" state
* via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
* be held when making this transition. * be held when making this transition.
* *
* After the lwb's write zio completes, it transitions into the "write * After the lwb's write zio completes, it transitions into the "write
@ -93,20 +93,23 @@ typedef struct lwb {
blkptr_t lwb_blk; /* on disk address of this log blk */ blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */
boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */
int lwb_nused; /* # used bytes in buffer */ int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */ int lwb_sz; /* size of block and buffer */
lwb_state_t lwb_state; /* the state of this lwb */ lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */ char *lwb_buf; /* log write buffer */
zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_write_zio; /* zio for the lwb buffer */
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
uint64_t lwb_issued_txg; /* the txg when the write is issued */ uint64_t lwb_issued_txg; /* the txg when the write is issued */
uint64_t lwb_max_txg; /* highest txg in this lwb */ uint64_t lwb_max_txg; /* highest txg in this lwb */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */
list_t lwb_itxs; /* list of itx's */ list_t lwb_itxs; /* list of itx's */
list_t lwb_waiters; /* list of zil_commit_waiter's */ list_t lwb_waiters; /* list of zil_commit_waiter's */
avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
} lwb_t; } lwb_t;
/* /*

View File

@ -146,6 +146,9 @@ static uint64_t zil_slog_bulk = 768 * 1024;
static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache; static kmem_cache_t *zil_zcw_cache;
static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
static itx_t *zil_itx_clone(itx_t *oitx);
static int static int
zil_bp_compare(const void *x1, const void *x2) zil_bp_compare(const void *x1, const void *x2)
{ {
@ -747,20 +750,21 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
lwb->lwb_blk = *bp; lwb->lwb_blk = *bp;
lwb->lwb_fastwrite = fastwrite; lwb->lwb_fastwrite = fastwrite;
lwb->lwb_slog = slog; lwb->lwb_slog = slog;
lwb->lwb_indirect = B_FALSE;
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
lwb->lwb_sz = BP_GET_LSIZE(bp);
} else {
lwb->lwb_nused = lwb->lwb_nfilled = 0;
lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
lwb->lwb_state = LWB_STATE_CLOSED; lwb->lwb_state = LWB_STATE_CLOSED;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
lwb->lwb_max_txg = txg;
lwb->lwb_write_zio = NULL; lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL; lwb->lwb_root_zio = NULL;
lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_timestamp = 0;
lwb->lwb_issued_txg = 0; lwb->lwb_issued_txg = 0;
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { lwb->lwb_max_txg = txg;
lwb->lwb_nused = sizeof (zil_chain_t);
lwb->lwb_sz = BP_GET_LSIZE(bp);
} else {
lwb->lwb_nused = 0;
lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb); list_insert_tail(&zilog->zl_lwb_list, lwb);
@ -1397,6 +1401,8 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
zilog->zl_commit_lr_seq = zilog->zl_lr_seq; zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
} }
mutex_exit(&zilog->zl_lock);
while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
zil_itx_destroy(itx); zil_itx_destroy(itx);
@ -1429,8 +1435,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
mutex_exit(&zcw->zcw_lock); mutex_exit(&zcw->zcw_lock);
} }
mutex_exit(&zilog->zl_lock);
mutex_enter(&zilog->zl_lwb_io_lock); mutex_enter(&zilog->zl_lwb_io_lock);
txg = lwb->lwb_issued_txg; txg = lwb->lwb_issued_txg;
ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
@ -1666,46 +1670,41 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], if (lwb->lwb_root_zio != NULL)
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, return;
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
lwb->lwb_root_zio = zio_root(zilog->zl_spa,
zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
mutex_enter(&zilog->zl_lock);
if (lwb->lwb_root_zio == NULL) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk)); BP_GET_LSIZE(&lwb->lwb_blk));
if (!lwb->lwb_fastwrite) {
metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
lwb->lwb_fastwrite = 1;
}
if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE; prio = ZIO_PRIORITY_SYNC_WRITE;
else else
prio = ZIO_PRIORITY_ASYNC_WRITE; prio = ZIO_PRIORITY_ASYNC_WRITE;
lwb->lwb_root_zio = zio_root(zilog->zl_spa, SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
ASSERT3P(lwb->lwb_root_zio, !=, NULL); lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, mutex_enter(&zilog->zl_lock);
BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, if (!lwb->lwb_fastwrite) {
prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
ASSERT3P(lwb->lwb_write_zio, !=, NULL); lwb->lwb_fastwrite = 1;
}
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0,
&lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
lwb->lwb_state = LWB_STATE_OPENED; lwb->lwb_state = LWB_STATE_OPENED;
zil_lwb_set_zio_dependency(zilog, lwb); zil_lwb_set_zio_dependency(zilog, lwb);
zilog->zl_last_lwb_opened = lwb; zilog->zl_last_lwb_opened = lwb;
}
mutex_exit(&zilog->zl_lock); mutex_exit(&zilog->zl_lock);
ASSERT3P(lwb->lwb_root_zio, !=, NULL);
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
} }
/* /*
@ -1736,11 +1735,11 @@ static const struct {
static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
/* /*
* Start a log block write and advance to the next log block. * Close the log block for being issued and allocate the next one.
* Calls are serialized. * Has to be called under zl_issuer_lock to chain more lwbs.
*/ */
static lwb_t * static lwb_t *
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
{ {
lwb_t *nlwb = NULL; lwb_t *nlwb = NULL;
zil_chain_t *zilc; zil_chain_t *zilc;
@ -1748,7 +1747,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
blkptr_t *bp; blkptr_t *bp;
dmu_tx_t *tx; dmu_tx_t *tx;
uint64_t txg; uint64_t txg;
uint64_t zil_blksz, wsz; uint64_t zil_blksz;
int i, error; int i, error;
boolean_t slog; boolean_t slog;
@ -1757,16 +1756,17 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /*
zilc = (zil_chain_t *)lwb->lwb_buf; * If this lwb includes indirect writes, we have to commit before
bp = &zilc->zc_next_blk; * creating the transaction, otherwise we may end up in dead lock.
} else { */
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); if (lwb->lwb_indirect) {
bp = &zilc->zc_next_blk; for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
itx = list_next(&lwb->lwb_itxs, itx))
zil_lwb_commit(zilog, lwb, itx);
lwb->lwb_nused = lwb->lwb_nfilled;
} }
ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
/* /*
* Allocate the next block and save its address in this block * Allocate the next block and save its address in this block
* before writing it in order to establish the log chain. * before writing it in order to establish the log chain.
@ -1816,17 +1816,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2)
zilc = (zil_chain_t *)lwb->lwb_buf;
else
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
bp = &zilc->zc_next_blk;
BP_ZERO(bp); BP_ZERO(bp);
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
if (slog) {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
lwb->lwb_nused);
} else {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
lwb->lwb_nused);
}
if (error == 0) { if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg); ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@ -1838,17 +1834,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
} }
lwb->lwb_state = LWB_STATE_ISSUED;
dmu_tx_commit(tx);
/*
* If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced().
*/
return (nlwb);
}
/*
* Finalize previously closed block and issue the write zio.
* Does not require locking.
*/
static void
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
zil_chain_t *zilc;
int wsz;
/* Actually fill the lwb with the data if not yet. */
if (!lwb->lwb_indirect) {
for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
itx = list_next(&lwb->lwb_itxs, itx))
zil_lwb_commit(zilog, lwb, itx);
lwb->lwb_nused = lwb->lwb_nfilled;
}
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
/* For Slim ZIL only write what is used. */ /* For Slim ZIL only write what is used. */
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int);
ASSERT3U(wsz, <=, lwb->lwb_sz); ASSERT3S(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_write_zio, wsz); zio_shrink(lwb->lwb_write_zio, wsz);
wsz = lwb->lwb_write_zio->io_size; wsz = lwb->lwb_write_zio->io_size;
zilc = (zil_chain_t *)lwb->lwb_buf;
} else { } else {
wsz = lwb->lwb_sz; wsz = lwb->lwb_sz;
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
} }
zilc->zc_pad = 0; zilc->zc_pad = 0;
zilc->zc_nused = lwb->lwb_nused; zilc->zc_nused = lwb->lwb_nused;
zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
@ -1858,22 +1884,20 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
*/ */
memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
if (lwb->lwb_slog) {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
lwb->lwb_nused);
} else {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
lwb->lwb_nused);
}
spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
zil_lwb_add_block(lwb, &lwb->lwb_blk); zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb->lwb_issued_timestamp = gethrtime(); lwb->lwb_issued_timestamp = gethrtime();
lwb->lwb_state = LWB_STATE_ISSUED;
zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_root_zio);
zio_nowait(lwb->lwb_write_zio); zio_nowait(lwb->lwb_write_zio);
dmu_tx_commit(tx);
/*
* If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced().
*/
return (nlwb);
} }
/* /*
@ -1909,13 +1933,19 @@ zil_max_copied_data(zilog_t *zilog)
sizeof (lr_write_t)); sizeof (lr_write_t));
} }
/*
* Estimate space needed in the lwb for the itx. Allocate more lwbs or
* split the itx as needed, but don't touch the actual transaction data.
* Has to be called under zl_issuer_lock to call zil_lwb_write_close()
* to chain more lwbs.
*/
static lwb_t * static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
{ {
lr_t *lrcb, *lrc; itx_t *citx;
lr_write_t *lrwb, *lrw; lr_t *lr, *clr;
char *lr_buf; lr_write_t *lrw;
uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb, !=, NULL);
@ -1923,8 +1953,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
zil_lwb_write_open(zilog, lwb); zil_lwb_write_open(zilog, lwb);
lrc = &itx->itx_lr; lr = &itx->itx_lr;
lrw = (lr_write_t *)lrc; lrw = (lr_write_t *)lr;
/* /*
* A commit itx doesn't represent any on-disk state; instead * A commit itx doesn't represent any on-disk state; instead
@ -1938,24 +1968,23 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
* *
* For more details, see the comment above zil_commit(). * For more details, see the comment above zil_commit().
*/ */
if (lrc->lrc_txtype == TX_COMMIT) { if (lr->lrc_txtype == TX_COMMIT) {
mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lock);
zil_commit_waiter_link_lwb(itx->itx_private, lwb); zil_commit_waiter_link_lwb(itx->itx_private, lwb);
itx->itx_private = NULL; itx->itx_private = NULL;
mutex_exit(&zilog->zl_lock); mutex_exit(&zilog->zl_lock);
list_insert_tail(&lwb->lwb_itxs, itx);
return (lwb); return (lwb);
} }
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED( dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t); lrw->lr_length, sizeof (uint64_t), uint64_t);
dpad = dlen - lrw->lr_length;
} else { } else {
dlen = dpad = 0; dlen = 0;
} }
reclen = lrc->lrc_reclen; reclen = lr->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen); zilog->zl_cur_used += (reclen + dlen);
txg = lrc->lrc_txg;
cont: cont:
/* /*
@ -1968,7 +1997,8 @@ cont:
lwb_sp < zil_max_waste_space(zilog) && lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 || (dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) { lwb_sp < reclen + dlen % max_log_data))) {
lwb = zil_lwb_write_issue(zilog, lwb); list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL) if (lwb == NULL)
return (NULL); return (NULL);
zil_lwb_write_open(zilog, lwb); zil_lwb_write_open(zilog, lwb);
@ -1987,19 +2017,99 @@ cont:
} }
dnow = MIN(dlen, lwb_sp - reclen); dnow = MIN(dlen, lwb_sp - reclen);
lr_buf = lwb->lwb_buf + lwb->lwb_nused; if (dlen > dnow) {
memcpy(lr_buf, lrc, reclen); ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ citx = zil_itx_clone(itx);
clr = &citx->itx_lr;
lr_write_t *clrw = (lr_write_t *)clr;
clrw->lr_length = dnow;
lrw->lr_offset += dnow;
lrw->lr_length -= dnow;
} else {
citx = itx;
clr = lr;
}
/*
* We're actually making an entry, so update lrc_seq to be the
* log record sequence number. Note that this is generally not
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
clr->lrc_seq = ++zilog->zl_lr_seq;
lwb->lwb_nused += reclen + dnow;
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
zil_lwb_add_txg(lwb, lr->lrc_txg);
list_insert_tail(&lwb->lwb_itxs, citx);
dlen -= dnow;
if (dlen > 0) {
zilog->zl_cur_used += reclen;
goto cont;
}
/*
* We have to really issue all queued LWBs before we may have to
* wait for a txg sync. Otherwise we may end up in a dead lock.
*/
if (lr->lrc_txtype == TX_WRITE) {
boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa);
if (frozen || itx->itx_wr_state == WR_INDIRECT) {
lwb_t *tlwb;
while ((tlwb = list_remove_head(ilwbs)) != NULL)
zil_lwb_write_issue(zilog, tlwb);
}
if (itx->itx_wr_state == WR_INDIRECT)
lwb->lwb_indirect = B_TRUE;
if (frozen)
txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
}
return (lwb);
}
/*
* Fill the actual transaction data into the lwb, following zil_lwb_assign().
* Does not require locking.
*/
static void
zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
{
lr_t *lr, *lrb;
lr_write_t *lrw, *lrwb;
char *lr_buf;
uint64_t dlen, reclen;
lr = &itx->itx_lr;
lrw = (lr_write_t *)lr;
if (lr->lrc_txtype == TX_COMMIT)
return;
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lr->lrc_reclen;
ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
memcpy(lr_buf, lr, reclen);
lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */
lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */
ZIL_STAT_BUMP(zilog, zil_itx_count); ZIL_STAT_BUMP(zilog, zil_itx_count);
/* /*
* If it's a write, fetch the data or get its blkptr as appropriate. * If it's a write, fetch the data or get its blkptr as appropriate.
*/ */
if (lrc->lrc_txtype == TX_WRITE) { if (lr->lrc_txtype == TX_WRITE) {
if (txg > spa_freeze_txg(zilog->zl_spa))
txg_wait_synced(zilog->zl_dmu_pool, txg);
if (itx->itx_wr_state == WR_COPIED) { if (itx->itx_wr_state == WR_COPIED) {
ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
@ -2010,14 +2120,10 @@ cont:
if (itx->itx_wr_state == WR_NEED_COPY) { if (itx->itx_wr_state == WR_NEED_COPY) {
dbuf = lr_buf + reclen; dbuf = lr_buf + reclen;
lrcb->lrc_reclen += dnow; lrb->lrc_reclen += dlen;
if (lrwb->lr_length > dnow)
lrwb->lr_length = dnow;
lrw->lr_offset += dnow;
lrw->lr_length -= dnow;
ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
dnow); dlen);
} else { } else {
ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
dbuf = NULL; dbuf = NULL;
@ -2044,9 +2150,11 @@ cont:
error = zilog->zl_get_data(itx->itx_private, error = zilog->zl_get_data(itx->itx_private,
itx->itx_gen, lrwb, dbuf, lwb, itx->itx_gen, lrwb, dbuf, lwb,
lwb->lwb_write_zio); lwb->lwb_write_zio);
if (dbuf != NULL && error == 0 && dnow == dlen) if (dbuf != NULL && error == 0) {
/* Zero any padding bytes in the last block. */ /* Zero any padding bytes in the last block. */
memset((char *)dbuf + lrwb->lr_length, 0, dpad); memset((char *)dbuf + lrwb->lr_length, 0,
dlen - lrwb->lr_length);
}
/* /*
* Typically, the only return values we should see from * Typically, the only return values we should see from
@ -2074,39 +2182,26 @@ cont:
error); error);
zfs_fallthrough; zfs_fallthrough;
case EIO: case EIO:
txg_wait_synced(zilog->zl_dmu_pool, txg); if (lwb->lwb_indirect) {
txg_wait_synced(zilog->zl_dmu_pool,
lr->lrc_txg);
} else {
lwb->lwb_write_zio->io_error = error;
}
zfs_fallthrough; zfs_fallthrough;
case ENOENT: case ENOENT:
zfs_fallthrough; zfs_fallthrough;
case EEXIST: case EEXIST:
zfs_fallthrough; zfs_fallthrough;
case EALREADY: case EALREADY:
return (lwb); return;
} }
} }
} }
/* lwb->lwb_nfilled += reclen + dlen;
* We're actually making an entry, so update lrc_seq to be the ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
* log record sequence number. Note that this is generally not ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
lrcb->lrc_seq = ++zilog->zl_lr_seq;
lwb->lwb_nused += reclen + dnow;
zil_lwb_add_txg(lwb, txg);
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
dlen -= dnow;
if (dlen > 0) {
zilog->zl_cur_used += reclen;
goto cont;
}
return (lwb);
} }
itx_t * itx_t *
@ -2131,6 +2226,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
return (itx); return (itx);
} }
/*
 * Make a copy of an itx.
 *
 * A new buffer of oitx->itx_size bytes is allocated and the original is
 * copied into it in full.  The copy's itx_callback and itx_callback_data
 * are then cleared, so the callback stays attached only to the original.
 * Returns the newly allocated itx.
 */
static itx_t *
zil_itx_clone(itx_t *oitx)
{
	itx_t *clone;

	clone = zio_data_buf_alloc(oitx->itx_size);
	memcpy(clone, oitx, oitx->itx_size);

	/* The clone must not carry the original's completion callback. */
	clone->itx_callback_data = NULL;
	clone->itx_callback = NULL;

	return (clone);
}
void void
zil_itx_destroy(itx_t *itx) zil_itx_destroy(itx_t *itx)
{ {
@ -2162,7 +2267,7 @@ zil_itxg_clean(void *arg)
/* /*
* In the general case, commit itxs will not be found * In the general case, commit itxs will not be found
* here, as they'll be committed to an lwb via * here, as they'll be committed to an lwb via
* zil_lwb_commit(), and free'd in that function. Having * zil_lwb_assign(), and free'd in that function. Having
* said that, it is still possible for commit itxs to be * said that, it is still possible for commit itxs to be
* found here, due to the following race: * found here, due to the following race:
* *
@ -2561,7 +2666,7 @@ zil_commit_writer_stall(zilog_t *zilog)
* lwb will be issued to the zio layer to be written to disk. * lwb will be issued to the zio layer to be written to disk.
*/ */
static void static void
zil_process_commit_list(zilog_t *zilog) zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
{ {
spa_t *spa = zilog->zl_spa; spa_t *spa = zilog->zl_spa;
list_t nolwb_itxs; list_t nolwb_itxs;
@ -2663,18 +2768,23 @@ zil_process_commit_list(zilog_t *zilog)
*/ */
if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
if (lwb != NULL) { if (lwb != NULL) {
lwb = zil_lwb_commit(zilog, itx, lwb); lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
if (lwb == NULL) {
if (lwb == NULL)
list_insert_tail(&nolwb_itxs, itx); list_insert_tail(&nolwb_itxs, itx);
else } else if ((zcw->zcw_lwb != NULL &&
list_insert_tail(&lwb->lwb_itxs, itx); zcw->zcw_lwb != lwb) || zcw->zcw_done) {
/*
* Our lwb is done, leave the rest of
* itx list to somebody else who care.
*/
first = B_FALSE;
break;
}
} else { } else {
if (lrc->lrc_txtype == TX_COMMIT) { if (lrc->lrc_txtype == TX_COMMIT) {
zil_commit_waiter_link_nolwb( zil_commit_waiter_link_nolwb(
itx->itx_private, &nolwb_waiters); itx->itx_private, &nolwb_waiters);
} }
list_insert_tail(&nolwb_itxs, itx); list_insert_tail(&nolwb_itxs, itx);
} }
} else { } else {
@ -2690,6 +2800,8 @@ zil_process_commit_list(zilog_t *zilog)
* the ZIL write pipeline; see the comment within * the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details. * zil_commit_writer_stall() for more details.
*/ */
while ((lwb = list_remove_head(ilwbs)) != NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog); zil_commit_writer_stall(zilog);
/* /*
@ -2735,13 +2847,13 @@ zil_process_commit_list(zilog_t *zilog)
* on the system, such that this function will be * on the system, such that this function will be
* immediately called again (not necessarily by the same * immediately called again (not necessarily by the same
* thread) and this lwb's zio will be issued via * thread) and this lwb's zio will be issued via
* zil_lwb_commit(). This way, the lwb is guaranteed to * zil_lwb_assign(). This way, the lwb is guaranteed to
* be "full" when it is issued to disk, and we'll make * be "full" when it is issued to disk, and we'll make
* use of the lwb's size the best we can. * use of the lwb's size the best we can.
* *
* 2. If there isn't sufficient ZIL activity occurring on * 2. If there isn't sufficient ZIL activity occurring on
* the system, such that this lwb's zio isn't issued via * the system, such that this lwb's zio isn't issued via
* zil_lwb_commit(), zil_commit_waiter() will issue the * zil_lwb_assign(), zil_commit_waiter() will issue the
* lwb's zio. If this occurs, the lwb is not guaranteed * lwb's zio. If this occurs, the lwb is not guaranteed
* to be "full" by the time its zio is issued, and means * to be "full" by the time its zio is issued, and means
* the size of the lwb was "too large" given the amount * the size of the lwb was "too large" given the amount
@ -2773,13 +2885,18 @@ zil_process_commit_list(zilog_t *zilog)
zfs_commit_timeout_pct / 100; zfs_commit_timeout_pct / 100;
if (sleep < zil_min_commit_timeout || if (sleep < zil_min_commit_timeout ||
lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) {
lwb = zil_lwb_write_issue(zilog, lwb); list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb);
zilog->zl_cur_used = 0; zilog->zl_cur_used = 0;
if (lwb == NULL) if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs))
!= NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog); zil_commit_writer_stall(zilog);
} }
} }
} }
}
} }
/* /*
@ -2799,9 +2916,13 @@ zil_process_commit_list(zilog_t *zilog)
static void static void
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
{ {
list_t ilwbs;
lwb_t *lwb;
ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_lock));
ASSERT(spa_writeable(zilog->zl_spa)); ASSERT(spa_writeable(zilog->zl_spa));
list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zilog->zl_issuer_lock);
if (zcw->zcw_lwb != NULL || zcw->zcw_done) { if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
@ -2828,10 +2949,13 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
zil_get_commit_list(zilog); zil_get_commit_list(zilog);
zil_prune_commit_list(zilog); zil_prune_commit_list(zilog);
zil_process_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs);
out: out:
mutex_exit(&zilog->zl_issuer_lock); mutex_exit(&zilog->zl_issuer_lock);
while ((lwb = list_remove_head(&ilwbs)) != NULL)
zil_lwb_write_issue(zilog, lwb);
list_destroy(&ilwbs);
} }
static void static void
@ -2858,7 +2982,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
return; return;
/* /*
* In order to call zil_lwb_write_issue() we must hold the * In order to call zil_lwb_write_close() we must hold the
* zilog's "zl_issuer_lock". We can't simply acquire that lock, * zilog's "zl_issuer_lock". We can't simply acquire that lock,
* since we're already holding the commit waiter's "zcw_lock", * since we're already holding the commit waiter's "zcw_lock",
* and those two locks are acquired in the opposite order * and those two locks are acquired in the opposite order
@ -2876,8 +3000,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* the waiter is marked "done"), so without this check we could * the waiter is marked "done"), so without this check we could
* wind up with a use-after-free error below. * wind up with a use-after-free error below.
*/ */
if (zcw->zcw_done) if (zcw->zcw_done) {
lwb = NULL;
goto out; goto out;
}
ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3P(lwb, ==, zcw->zcw_lwb);
@ -2896,15 +3022,17 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* if it's ISSUED or OPENED, and block any other threads that might * if it's ISSUED or OPENED, and block any other threads that might
* attempt to issue this lwb. For that reason we hold the * attempt to issue this lwb. For that reason we hold the
* zl_issuer_lock when checking the lwb_state; we must not call * zl_issuer_lock when checking the lwb_state; we must not call
* zil_lwb_write_issue() if the lwb had already been issued. * zil_lwb_write_close() if the lwb had already been issued.
* *
* See the comment above the lwb_state_t structure definition for * See the comment above the lwb_state_t structure definition for
* more details on the lwb states, and locking requirements. * more details on the lwb states, and locking requirements.
*/ */
if (lwb->lwb_state == LWB_STATE_ISSUED || if (lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_WRITE_DONE ||
lwb->lwb_state == LWB_STATE_FLUSH_DONE) lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
lwb = NULL;
goto out; goto out;
}
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
@ -2914,7 +3042,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* since we've reached the commit waiter's timeout and it still * since we've reached the commit waiter's timeout and it still
* hasn't been issued. * hasn't been issued.
*/ */
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
@ -2934,7 +3062,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
if (nlwb == NULL) { if (nlwb == NULL) {
/* /*
* When zil_lwb_write_issue() returns NULL, this * When zil_lwb_write_close() returns NULL, this
* indicates zio_alloc_zil() failed to allocate the * indicates zio_alloc_zil() failed to allocate the
* "next" lwb on-disk. When this occurs, the ZIL write * "next" lwb on-disk. When this occurs, the ZIL write
* pipeline must be stalled; see the comment within the * pipeline must be stalled; see the comment within the
@ -2956,12 +3084,16 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* lock, which occurs prior to calling dmu_tx_commit() * lock, which occurs prior to calling dmu_tx_commit()
*/ */
mutex_exit(&zcw->zcw_lock); mutex_exit(&zcw->zcw_lock);
zil_lwb_write_issue(zilog, lwb);
lwb = NULL;
zil_commit_writer_stall(zilog); zil_commit_writer_stall(zilog);
mutex_enter(&zcw->zcw_lock); mutex_enter(&zcw->zcw_lock);
} }
out: out:
mutex_exit(&zilog->zl_issuer_lock); mutex_exit(&zilog->zl_issuer_lock);
if (lwb)
zil_lwb_write_issue(zilog, lwb);
ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock));
} }
@ -2976,7 +3108,7 @@ out:
* waited "long enough" and the lwb is still in the "open" state. * waited "long enough" and the lwb is still in the "open" state.
* *
* Given a sufficient amount of itxs being generated and written using * Given a sufficient amount of itxs being generated and written using
* the ZIL, the lwb's zio will be issued via the zil_lwb_commit() * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
* function. If this does not occur, this secondary responsibility will * function. If this does not occur, this secondary responsibility will
* ensure the lwb is issued even if there is not other synchronous * ensure the lwb is issued even if there is not other synchronous
* activity on the system. * activity on the system.
@ -3656,7 +3788,7 @@ zil_close(zilog_t *zilog)
/* /*
* zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
* on the time when the dmu_tx transaction is assigned in * on the time when the dmu_tx transaction is assigned in
* zil_lwb_write_issue(). * zil_lwb_write_close().
*/ */
mutex_enter(&zilog->zl_lwb_io_lock); mutex_enter(&zilog->zl_lwb_io_lock);
txg = MAX(zilog->zl_lwb_max_issued_txg, txg); txg = MAX(zilog->zl_lwb_max_issued_txg, txg);