DDT: Reduce global DDT lock scope during writes

Before this change DDT lock was taken 4 times per written block,
and as effectively a pool-wide lock it can be highly congested.
This change introduces a new per-entry dde_io_lock, protecting some
fields during I/O ready and done stages, so that we don't need the
global lock there.

According to my write tests on 64-thread system with 4KB blocks this
significantly reduce the global lock contention, reducing CPU usage
from 100% to expected ~80%, and increasing write throughput by 10%.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17960
This commit is contained in:
Alexander Motin 2025-12-01 13:44:10 -05:00 committed by Brian Behlendorf
parent a785ddc5f3
commit a41ef36858
3 changed files with 87 additions and 37 deletions

View File

@ -219,6 +219,9 @@ typedef enum {
* because its relatively rarely used.
*/
typedef struct {
/* protects dde_phys, dde_orig_phys and dde_lead_zio during I/O */
kmutex_t dde_io_lock;
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;

View File

@ -1010,6 +1010,7 @@ ddt_alloc_entry_io(ddt_entry_t *dde)
return;
dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
}
static void
@ -1022,6 +1023,7 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
if (dde->dde_io->dde_repair_abd != NULL)
abd_free(dde->dde_io->dde_repair_abd);
mutex_destroy(&dde->dde_io->dde_io_lock);
kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
}

View File

@ -3683,7 +3683,7 @@ zio_ddt_child_write_done(zio_t *zio)
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
ddt_univ_phys_t *ddp = dde->dde_phys;
ddt_enter(ddt);
mutex_enter(&dde->dde_io->dde_io_lock);
/* we're the lead, so once we're done there's no one else outstanding */
if (dde->dde_io->dde_lead_zio[p] == zio)
@ -3700,21 +3700,24 @@ zio_ddt_child_write_done(zio_t *zio)
ddt_phys_unextend(ddp, orig, v);
ddt_phys_clear(orig, v);
mutex_exit(&dde->dde_io->dde_io_lock);
/*
* Undo the optimistic refcount increments that were done in
* zio_ddt_write() for all non-DDT-child parents. Since errors
* are rare, taking the global lock here is acceptable.
*/
ddt_enter(ddt);
zio_t *pio;
zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
ddt_phys_decref(ddp, v);
}
ddt_exit(ddt);
return;
}
/*
* Add references for all dedup writes that were waiting on the
* physical one, skipping any other physical writes that are waiting.
*/
zio_t *pio;
zl = NULL;
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
ddt_phys_addref(ddp, v);
}
/*
* We've successfully added new DVAs to the entry. Clear the saved
* state or, if there's still outstanding IO, remember it so we can
@ -3725,7 +3728,7 @@ zio_ddt_child_write_done(zio_t *zio)
else
ddt_phys_copy(orig, ddp, v);
ddt_exit(ddt);
mutex_exit(&dde->dde_io->dde_io_lock);
}
static void
@ -3753,7 +3756,7 @@ zio_ddt_child_write_ready(zio_t *zio)
if (zio->io_error != 0)
return;
ddt_enter(ddt);
mutex_enter(&dde->dde_io->dde_io_lock);
ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
@ -3764,7 +3767,7 @@ zio_ddt_child_write_ready(zio_t *zio)
ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
}
ddt_exit(ddt);
mutex_exit(&dde->dde_io->dde_io_lock);
}
static zio_t *
@ -3799,11 +3802,11 @@ zio_ddt_write(zio_t *zio)
dde = ddt_lookup(ddt, bp, B_FALSE);
if (dde == NULL) {
/* DDT size is over its quota so no new entries */
ddt_exit(ddt);
zp->zp_dedup = B_FALSE;
BP_SET_DEDUP(bp, B_FALSE);
if (zio->io_bp_override == NULL)
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (zio);
}
@ -3814,6 +3817,7 @@ zio_ddt_write(zio_t *zio)
* we can't resolve it, so just convert to an ordinary write.
* (And automatically e-mail a paper to Nature?)
*/
ddt_exit(ddt);
if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP)) {
zp->zp_checksum = spa_dedup_checksum(spa);
@ -3826,7 +3830,6 @@ zio_ddt_write(zio_t *zio)
}
ASSERT(!BP_GET_DEDUP(bp));
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (zio);
}
@ -3877,10 +3880,15 @@ zio_ddt_write(zio_t *zio)
uint8_t parent_dvas = 0;
/*
* What we do next depends on whether or not there's IO outstanding that
* will update this entry.
* What we do next depends on whether or not there's IO outstanding
* that will update this entry. If dde_io exists, we need to hold
* its lock to safely check and use dde_lead_zio.
*/
if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
ddt_entry_io_t *dde_io = dde->dde_io;
if (dde_io != NULL)
mutex_enter(&dde_io->dde_io_lock);
if (dde_io == NULL || dde_io->dde_lead_zio[p] == NULL) {
/*
* No IO outstanding, so we only need to worry about ourselves.
*/
@ -3895,6 +3903,8 @@ zio_ddt_write(zio_t *zio)
* block and leave.
*/
if (have_dvas == 0) {
if (dde_io != NULL)
mutex_exit(&dde_io->dde_io_lock);
ASSERT(BP_GET_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_extend(ddp, v, bp);
@ -3923,6 +3933,9 @@ zio_ddt_write(zio_t *zio)
* then we can just use them as-is.
*/
if (have_dvas >= need_dvas) {
if (dde_io != NULL)
mutex_exit(&dde_io->dde_io_lock);
/*
* For rewrite operations, try preserving the original
* logical birth time. If the result matches the
@ -3934,8 +3947,8 @@ zio_ddt_write(zio_t *zio)
ddt_bp_fill(ddp, v, bp, orig_logical_birth);
if (BP_EQUAL(bp, &zio->io_bp_orig)) {
/* We can skip accounting. */
zio->io_flags |= ZIO_FLAG_NOPWRITE;
ddt_exit(ddt);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (zio);
}
}
@ -3997,7 +4010,16 @@ zio_ddt_write(zio_t *zio)
* missed out.
*/
ddt_bp_fill(ddp, v, bp, txg);
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
piggyback:
zio_add_child(zio, dde_io->dde_lead_zio[p]);
/*
* Optimistically increment refcount for this parent.
* If the write fails, zio_ddt_child_write_done() will
* decrement for all non-DDT-child parents.
*/
ddt_phys_addref(ddp, v);
mutex_exit(&dde_io->dde_io_lock);
ddt_exit(ddt);
return (zio);
}
@ -4023,11 +4045,8 @@ zio_ddt_write(zio_t *zio)
ASSERT(pio);
parent_dvas = pio->io_prop.zp_copies;
if (parent_dvas >= need_dvas) {
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
ddt_exit(ddt);
return (zio);
}
if (parent_dvas >= need_dvas)
goto piggyback;
/*
* Still not enough, so we will need to issue to get the
@ -4037,10 +4056,12 @@ zio_ddt_write(zio_t *zio)
}
if (is_ganged) {
if (dde_io != NULL)
mutex_exit(&dde_io->dde_io_lock);
ddt_exit(ddt);
zp->zp_dedup = B_FALSE;
BP_SET_DEDUP(bp, B_FALSE);
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
return (zio);
}
@ -4070,21 +4091,45 @@ zio_ddt_write(zio_t *zio)
* We are the new lead zio, because our parent has the highest
* zp_copies that has been requested for this entry so far.
*/
ddt_alloc_entry_io(dde);
if (dde->dde_io->dde_lead_zio[p] == NULL) {
if (dde_io == NULL) {
/*
* New dde_io. No lock needed since no other thread can have
* a reference yet.
*/
ddt_alloc_entry_io(dde);
dde_io = dde->dde_io;
/*
* First time out, take a copy of the stable entry to revert
* to if there's an error (see zio_ddt_child_write_done())
*/
ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
ddt_phys_copy(&dde_io->dde_orig_phys, dde->dde_phys, v);
dde_io->dde_lead_zio[p] = cio;
} else {
/*
* Make the existing chain our child, because it cannot
* complete until we have.
*/
zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
if (dde_io->dde_lead_zio[p] == NULL) {
/*
* First time out, take a copy of the stable entry
* to revert to if there's an error (see
* zio_ddt_child_write_done())
*/
ddt_phys_copy(&dde_io->dde_orig_phys, dde->dde_phys,
v);
} else {
/*
* Make the existing chain our child, because it
* cannot complete until we have.
*/
zio_add_child(cio, dde_io->dde_lead_zio[p]);
}
dde_io->dde_lead_zio[p] = cio;
mutex_exit(&dde_io->dde_io_lock);
}
dde->dde_io->dde_lead_zio[p] = cio;
/*
* Optimistically increment the refcount for this dedup write.
* If the write fails, zio_ddt_child_write_done() will decrement
* for all non-DDT-child parents.
*/
ddt_phys_addref(ddp, v);
ddt_exit(ddt);