DDT: Move log searches out of the lock

Postponing the removal of an entry from the DDT log on a hit until the
later single-threaded sync stage keeps ddl_tree stable during the
multi-threaded ZIO processing stage.  That allows the DDT lock to be
dropped before the search instead of after it, greatly reducing contention.

ddt_log_update_entry() already handled the case of an entry being
present in the active log, so we only need to remove the entry from
the flushing log if it happens to be there.

My tests with parallel 4KB block writes show a throughput increase
from 480MB/s (122K blocks/s) to 827MB/s (212K blocks/s), even
though throughput is still limited by global DDT lock contention.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #18044
This commit is contained in:
Alexander Motin
2025-12-15 12:17:04 -05:00
committed by GitHub
parent 3d76ba2737
commit 46d6f1fe56
5 changed files with 88 additions and 61 deletions
+50 -39
View File
@@ -1037,13 +1037,6 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
/* Entry is still in the log, so charge the entry back to it */
if (dde->dde_flags & DDE_FLAG_LOGGED) {
ddt_lightweight_entry_t ddlwe;
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
}
avl_remove(&ddt->ddt_tree, dde);
ddt_free(ddt, dde);
}
@@ -1234,63 +1227,61 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
/* Time to make a new entry. */
dde = ddt_alloc(ddt, &search);
/* Record the time this class was created (used by ddt prune) */
if (ddt->ddt_flags & DDT_FLAG_FLAT)
dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
avl_insert(&ddt->ddt_tree, dde, where);
/* If its in the log tree, we can "load" it from there */
/*
* The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible
* threads will wait even while we drop the lock.
*/
ddt_exit(ddt);
/*
* If there is a log, we should try to "load" from there first.
*/
if (ddt->ddt_flags & DDT_FLAG_LOG) {
ddt_lightweight_entry_t ddlwe;
boolean_t from_flushing;
if (ddt_log_find_key(ddt, &search, &ddlwe)) {
/*
* See if we have the key first, and if so, set up
* the entry.
*/
/* Read-only search, no locks needed (logs stable during I/O) */
if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) {
dde->dde_type = ddlwe.ddlwe_type;
dde->dde_class = ddlwe.ddlwe_class;
memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
DDT_PHYS_SIZE(ddt));
/* Whatever we found isn't valid for this BP, eject */
if (verify &&
!ddt_entry_lookup_is_valid(ddt, bp, dde)) {
/*
* Check validity. If invalid and no waiters, clean up
* immediately. Otherwise continue setup for waiters.
*/
boolean_t valid = !verify ||
ddt_entry_lookup_is_valid(ddt, bp, dde);
ddt_enter(ddt);
if (!valid && dde->dde_waiters == 0) {
avl_remove(&ddt->ddt_tree, dde);
ddt_free(ddt, dde);
return (NULL);
}
/* Remove it and count it */
if (ddt_log_remove_key(ddt,
ddt->ddt_log_active, &search)) {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
} else {
VERIFY(ddt_log_remove_key(ddt,
ddt->ddt_log_flushing, &search));
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
if (from_flushing) {
dde->dde_flags |= DDE_FLAG_FROM_FLUSHING;
DDT_KSTAT_BUMP(ddt,
dds_lookup_log_flushing_hit);
} else {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
}
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
return (dde);
cv_broadcast(&dde->dde_cv);
return (valid ? dde : NULL);
}
DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
}
/*
* ddt_tree is now stable, so unlock and let everyone else keep moving.
* Anyone landing on this entry will find it without DDE_FLAG_LOADED,
* and go to sleep waiting for it above.
*/
ddt_exit(ddt);
/* Search all store objects for the entry. */
error = ENOENT;
for (type = 0; type < DDT_TYPES; type++) {
@@ -2354,6 +2345,19 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
/* If from flushing log, remove it. */
if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) {
VERIFY(ddt_log_remove_key(ddt,
ddt->ddt_log_flushing, &ddlwe.ddlwe_key));
}
/* Update class_start to track last modification time */
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
ddt_class_start();
}
ddt_log_entry(ddt, &ddlwe, &dlu);
ddt_sync_scan_entry(ddt, &ddlwe, tx);
ddt_free(ddt, dde);
@@ -2414,6 +2418,13 @@ ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
ddt_lightweight_entry_t ddlwe;
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
/* Update class_start to track last modification time */
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
ddt_class_start();
}
ddt_sync_flush_entry(ddt, &ddlwe,
dde->dde_type, dde->dde_class, tx);
ddt_sync_scan_entry(ddt, &ddlwe, tx);
@@ -2765,7 +2776,7 @@ ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
* If this entry is on the log, then the stored entry is stale
* and we should skip it.
*/
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL))
continue;
/* prune older entries */
+22 -10
View File
@@ -252,7 +252,8 @@ ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
}
static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe,
boolean_t hist)
{
/* Create the log tree entry from a live or stored entry */
avl_index_t where;
@@ -262,7 +263,13 @@ ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
ddle = ddt_log_alloc_entry(ddt);
ddle->ddle_key = ddlwe->ddlwe_key;
avl_insert(&ddl->ddl_tree, ddle, where);
} else if (hist) {
ddt_lightweight_entry_t oddlwe;
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe);
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe);
}
if (hist)
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
ddle->ddle_type = ddlwe->ddlwe_type;
ddle->ddle_class = ddlwe->ddlwe_class;
memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
@@ -273,8 +280,7 @@ ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
ASSERT3U(dlu->dlu_dbp, !=, NULL);
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE);
/* Get our block */
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
@@ -381,14 +387,20 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe)
ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing)
{
ddt_log_entry_t *ddle =
avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
if (!ddle)
ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree,
ddk, NULL);
if (ddle) {
if (from_flushing)
*from_flushing = B_FALSE;
} else {
ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
if (!ddle)
return (B_FALSE);
if (!ddle)
return (B_FALSE);
if (from_flushing)
*from_flushing = B_TRUE;
}
if (ddlwe)
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
return (B_TRUE);
@@ -524,7 +536,7 @@ ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
ddlwe.ddlwe_key = dlre->dlre_key;
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
ddt_log_update_entry(ddt, ddl, &ddlwe);
ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE);
}
static void
+7 -5
View File
@@ -4067,19 +4067,21 @@ piggyback:
/*
* We need to write. We will create a new write with the copies
* property adjusted to match the number of DVAs we need to need to
* grow the DDT entry by to satisfy the request.
* property adjusted to match the number of DVAs we need to grow
* the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
zio_prop_t czp;
if (have_dvas > 0 || parent_dvas > 0) {
czp = *zp;
czp.zp_copies = need_dvas;
czp.zp_gang_copies = 0;
zp = &czp;
} else {
ASSERT3U(czp.zp_copies, ==, need_dvas);
ASSERT3U(zp->zp_copies, ==, need_dvas);
}
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);