DDT: Reduce global DDT lock scope during writes
Before this change the DDT lock was taken 4 times per written block, and since it is effectively a pool-wide lock it can be highly contended. This change introduces a new per-entry dde_io_lock that protects some fields during the I/O ready and done stages, so the global lock is no longer needed there. In my write tests on a 64-thread system with 4KB blocks this significantly reduced global lock contention, cutting CPU usage from 100% to the expected ~80% and increasing write throughput by 10%.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17960
commit a41ef36858
parent a785ddc5f3
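In outline, the commit replaces a single coarse lock with a two-level scheme: the pool-wide DDT lock still guards table lookup and insertion, while each entry now carries its own mutex for the state touched by the I/O ready and done callbacks. A minimal standalone sketch of that pattern follows (pthreads and hypothetical names such as struct entry and io_state, not the actual OpenZFS types):

#include <pthread.h>

/*
 * Two-level locking sketch (illustrative names only). "global_lock"
 * plays the role of the pool-wide DDT lock; "io_lock" plays the role
 * of the new per-entry dde_io_lock.
 */
struct entry {
        pthread_mutex_t io_lock;        /* per-entry, like dde_io_lock */
        int io_state;                   /* state touched at ready/done */
};

struct table {
        pthread_mutex_t global_lock;    /* pool-wide, like the DDT lock */
        /* ... hash of entries, protected by global_lock ... */
};

/* Before: every I/O done callback serialized on the global lock. */
static void
done_stage_before(struct table *t, struct entry *e)
{
        pthread_mutex_lock(&t->global_lock);
        e->io_state = 0;
        pthread_mutex_unlock(&t->global_lock);
}

/* After: the done callback takes only the entry's own lock. */
static void
done_stage_after(struct entry *e)
{
        pthread_mutex_lock(&e->io_lock);
        e->io_state = 0;
        pthread_mutex_unlock(&e->io_lock);
}

With the per-entry lock, done callbacks for different blocks proceed in parallel; only lookup and insertion still funnel through the global lock, which matches the contention reduction described above.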
include/sys/ddt.h

@@ -219,6 +219,9 @@ typedef enum {
  * because its relatively rarely used.
  */
 typedef struct {
+        /* protects dde_phys, dde_orig_phys and dde_lead_zio during I/O */
+        kmutex_t dde_io_lock;
+
         /* copy of data after a repair read, to be rewritten */
         abd_t *dde_repair_abd;
 
module/zfs/ddt.c

@@ -1010,6 +1010,7 @@ ddt_alloc_entry_io(ddt_entry_t *dde)
                 return;
 
         dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
+        mutex_init(&dde->dde_io->dde_io_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 static void
@@ -1022,6 +1023,7 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
                 if (dde->dde_io->dde_repair_abd != NULL)
                         abd_free(dde->dde_io->dde_repair_abd);
 
+                mutex_destroy(&dde->dde_io->dde_io_lock);
                 kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
         }
 
module/zfs/zio.c (119 lines changed)
@@ -3683,7 +3683,7 @@ zio_ddt_child_write_done(zio_t *zio)
         ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
         ddt_univ_phys_t *ddp = dde->dde_phys;
 
-        ddt_enter(ddt);
+        mutex_enter(&dde->dde_io->dde_io_lock);
 
         /* we're the lead, so once we're done there's no one else outstanding */
         if (dde->dde_io->dde_lead_zio[p] == zio)
@@ -3700,21 +3700,24 @@ zio_ddt_child_write_done(zio_t *zio)
                 ddt_phys_unextend(ddp, orig, v);
                 ddt_phys_clear(orig, v);
 
+                mutex_exit(&dde->dde_io->dde_io_lock);
+
+                /*
+                 * Undo the optimistic refcount increments that were done in
+                 * zio_ddt_write() for all non-DDT-child parents. Since errors
+                 * are rare, taking the global lock here is acceptable.
+                 */
+                ddt_enter(ddt);
+                zio_t *pio;
+                zl = NULL;
+                while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+                        if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
+                                ddt_phys_decref(ddp, v);
+                }
                 ddt_exit(ddt);
                 return;
         }
 
-        /*
-         * Add references for all dedup writes that were waiting on the
-         * physical one, skipping any other physical writes that are waiting.
-         */
-        zio_t *pio;
-        zl = NULL;
-        while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
-                if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
-                        ddt_phys_addref(ddp, v);
-        }
-
         /*
          * We've successfully added new DVAs to the entry. Clear the saved
          * state or, if there's still outstanding IO, remember it so we can
@@ -3725,7 +3728,7 @@ zio_ddt_child_write_done(zio_t *zio)
         else
                 ddt_phys_copy(orig, ddp, v);
 
-        ddt_exit(ddt);
+        mutex_exit(&dde->dde_io->dde_io_lock);
 }
 
 static void
@@ -3753,7 +3756,7 @@ zio_ddt_child_write_ready(zio_t *zio)
         if (zio->io_error != 0)
                 return;
 
-        ddt_enter(ddt);
+        mutex_enter(&dde->dde_io->dde_io_lock);
 
         ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
 
@@ -3764,7 +3767,7 @@ zio_ddt_child_write_ready(zio_t *zio)
                 ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
         }
 
-        ddt_exit(ddt);
+        mutex_exit(&dde->dde_io->dde_io_lock);
 }
 
 static zio_t *
@@ -3799,11 +3802,11 @@ zio_ddt_write(zio_t *zio)
         dde = ddt_lookup(ddt, bp, B_FALSE);
         if (dde == NULL) {
                 /* DDT size is over its quota so no new entries */
+                ddt_exit(ddt);
                 zp->zp_dedup = B_FALSE;
                 BP_SET_DEDUP(bp, B_FALSE);
                 if (zio->io_bp_override == NULL)
                         zio->io_pipeline = ZIO_WRITE_PIPELINE;
-                ddt_exit(ddt);
                 return (zio);
         }
 
@@ -3814,6 +3817,7 @@ zio_ddt_write(zio_t *zio)
                  * we can't resolve it, so just convert to an ordinary write.
                  * (And automatically e-mail a paper to Nature?)
                  */
+                ddt_exit(ddt);
                 if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
                     ZCHECKSUM_FLAG_DEDUP)) {
                         zp->zp_checksum = spa_dedup_checksum(spa);
@@ -3826,7 +3830,6 @@ zio_ddt_write(zio_t *zio)
                 }
                 ASSERT(!BP_GET_DEDUP(bp));
                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
-                ddt_exit(ddt);
                 return (zio);
         }
 
@@ -3877,10 +3880,15 @@ zio_ddt_write(zio_t *zio)
         uint8_t parent_dvas = 0;
 
         /*
-         * What we do next depends on whether or not there's IO outstanding that
-         * will update this entry.
+         * What we do next depends on whether or not there's IO outstanding
+         * that will update this entry. If dde_io exists, we need to hold
+         * its lock to safely check and use dde_lead_zio.
          */
-        if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
+        ddt_entry_io_t *dde_io = dde->dde_io;
+        if (dde_io != NULL)
+                mutex_enter(&dde_io->dde_io_lock);
+
+        if (dde_io == NULL || dde_io->dde_lead_zio[p] == NULL) {
                 /*
                  * No IO outstanding, so we only need to worry about
                  * ourselves.
@@ -3895,6 +3903,8 @@ zio_ddt_write(zio_t *zio)
                  * block and leave.
                  */
                 if (have_dvas == 0) {
+                        if (dde_io != NULL)
+                                mutex_exit(&dde_io->dde_io_lock);
                         ASSERT(BP_GET_BIRTH(bp) == txg);
                         ASSERT(BP_EQUAL(bp, zio->io_bp_override));
                         ddt_phys_extend(ddp, v, bp);
@@ -3923,6 +3933,9 @@ zio_ddt_write(zio_t *zio)
                  * then we can just use them as-is.
                  */
                 if (have_dvas >= need_dvas) {
+                        if (dde_io != NULL)
+                                mutex_exit(&dde_io->dde_io_lock);
+
                         /*
                          * For rewrite operations, try preserving the original
                          * logical birth time. If the result matches the
@@ -3934,8 +3947,8 @@ zio_ddt_write(zio_t *zio)
                         ddt_bp_fill(ddp, v, bp, orig_logical_birth);
                         if (BP_EQUAL(bp, &zio->io_bp_orig)) {
                                 /* We can skip accounting. */
-                                zio->io_flags |= ZIO_FLAG_NOPWRITE;
                                 ddt_exit(ddt);
+                                zio->io_flags |= ZIO_FLAG_NOPWRITE;
                                 return (zio);
                         }
                 }
@@ -3997,7 +4010,16 @@ zio_ddt_write(zio_t *zio)
                  * missed out.
                  */
                 ddt_bp_fill(ddp, v, bp, txg);
-                zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
+piggyback:
+                zio_add_child(zio, dde_io->dde_lead_zio[p]);
+
+                /*
+                 * Optimistically increment refcount for this parent.
+                 * If the write fails, zio_ddt_child_write_done() will
+                 * decrement for all non-DDT-child parents.
+                 */
+                ddt_phys_addref(ddp, v);
+                mutex_exit(&dde_io->dde_io_lock);
                 ddt_exit(ddt);
                 return (zio);
         }
@@ -4023,11 +4045,8 @@ zio_ddt_write(zio_t *zio)
                 ASSERT(pio);
                 parent_dvas = pio->io_prop.zp_copies;
 
-                if (parent_dvas >= need_dvas) {
-                        zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
-                        ddt_exit(ddt);
-                        return (zio);
-                }
+                if (parent_dvas >= need_dvas)
+                        goto piggyback;
 
                 /*
                  * Still not enough, so we will need to issue to get the
@@ -4037,10 +4056,12 @@ zio_ddt_write(zio_t *zio)
         }
 
         if (is_ganged) {
+                if (dde_io != NULL)
+                        mutex_exit(&dde_io->dde_io_lock);
+                ddt_exit(ddt);
                 zp->zp_dedup = B_FALSE;
                 BP_SET_DEDUP(bp, B_FALSE);
                 zio->io_pipeline = ZIO_WRITE_PIPELINE;
-                ddt_exit(ddt);
                 return (zio);
         }
 
@@ -4070,21 +4091,45 @@ zio_ddt_write(zio_t *zio)
          * We are the new lead zio, because our parent has the highest
          * zp_copies that has been requested for this entry so far.
          */
-        ddt_alloc_entry_io(dde);
-        if (dde->dde_io->dde_lead_zio[p] == NULL) {
+        if (dde_io == NULL) {
+                /*
+                 * New dde_io. No lock needed since no other thread can have
+                 * a reference yet.
+                 */
+                ddt_alloc_entry_io(dde);
+                dde_io = dde->dde_io;
                 /*
                  * First time out, take a copy of the stable entry to revert
                  * to if there's an error (see zio_ddt_child_write_done())
                  */
-                ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
+                ddt_phys_copy(&dde_io->dde_orig_phys, dde->dde_phys, v);
+                dde_io->dde_lead_zio[p] = cio;
         } else {
-                /*
-                 * Make the existing chain our child, because it cannot
-                 * complete until we have.
-                 */
-                zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
+                if (dde_io->dde_lead_zio[p] == NULL) {
+                        /*
+                         * First time out, take a copy of the stable entry
+                         * to revert to if there's an error (see
+                         * zio_ddt_child_write_done())
+                         */
+                        ddt_phys_copy(&dde_io->dde_orig_phys, dde->dde_phys,
+                            v);
+                } else {
+                        /*
+                         * Make the existing chain our child, because it
+                         * cannot complete until we have.
+                         */
+                        zio_add_child(cio, dde_io->dde_lead_zio[p]);
+                }
+                dde_io->dde_lead_zio[p] = cio;
+                mutex_exit(&dde_io->dde_io_lock);
         }
-        dde->dde_io->dde_lead_zio[p] = cio;
 
+        /*
+         * Optimistically increment the refcount for this dedup write.
+         * If the write fails, zio_ddt_child_write_done() will decrement
+         * for all non-DDT-child parents.
+         */
+        ddt_phys_addref(ddp, v);
+
         ddt_exit(ddt);
 
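A note on the ordering implicit in the diff above: zio_ddt_write() takes the global DDT lock first and the per-entry dde_io_lock second, and the error path in zio_ddt_child_write_done() drops the entry lock before re-acquiring the global one, so the two are never taken in the opposite order. The new comments also state that dde_io is only ever allocated while the global lock is held, which is why it can be tested for NULL before its own lock is taken. A minimal sketch of that discipline (pthreads and hypothetical names, not the actual OpenZFS API):

#include <pthread.h>
#include <stddef.h>

struct entry_io {
        pthread_mutex_t io_lock;        /* stands in for dde_io_lock */
        void *lead;                     /* stands in for dde_lead_zio[] */
};

struct entry {
        struct entry_io *io;    /* lazily allocated under the global lock */
};

struct table {
        pthread_mutex_t global_lock;    /* stands in for the DDT lock */
};

static void
write_path(struct table *t, struct entry *e)
{
        pthread_mutex_lock(&t->global_lock);    /* 1st: global lock */

        /*
         * e->io may be tested without its lock: it is only ever
         * allocated while global_lock is held, and we hold it now.
         */
        struct entry_io *io = e->io;
        if (io != NULL)
                pthread_mutex_lock(&io->io_lock);       /* 2nd: entry lock */

        /* ... check io->lead; extend or piggyback on in-flight I/O ... */

        if (io != NULL)
                pthread_mutex_unlock(&io->io_lock);
        pthread_mutex_unlock(&t->global_lock);
}

The optimistic refcounting added in the diff follows from the same split: references are bumped on the issue path while the locks are already held, and only the rare error path pays for re-taking the global lock to undo them.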