mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 02:44:41 +03:00
DDT: Move log searches out of the lock
Postponing the removal of an entry from the DDT log on a hit until the later single-threaded sync stage keeps ddl_tree stable during the multi-threaded ZIO processing stage. This allows the DDT lock to be dropped before the log search instead of after it, which reduces contention considerably. ddt_log_update_entry() already handled the case of an entry being present in the active log, so we only need to remove the entry from the flushing log if it happens to be there. My tests with parallel 4KB block writes show throughput increasing from 480MB/s (122K blocks/s) to 827MB/s (212K blocks/s), although it is still limited by contention on the global DDT lock. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com> Closes #18044
This commit is contained in:
+50
-39
@@ -1037,13 +1037,6 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
|
||||
|
||||
/* Entry is still in the log, so charge the entry back to it */
|
||||
if (dde->dde_flags & DDE_FLAG_LOGGED) {
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
|
||||
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
|
||||
}
|
||||
|
||||
avl_remove(&ddt->ddt_tree, dde);
|
||||
ddt_free(ddt, dde);
|
||||
}
|
||||
@@ -1234,63 +1227,61 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
|
||||
|
||||
/* Time to make a new entry. */
|
||||
dde = ddt_alloc(ddt, &search);
|
||||
|
||||
/* Record the time this class was created (used by ddt prune) */
|
||||
if (ddt->ddt_flags & DDT_FLAG_FLAT)
|
||||
dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
|
||||
|
||||
avl_insert(&ddt->ddt_tree, dde, where);
|
||||
|
||||
/* If its in the log tree, we can "load" it from there */
|
||||
/*
|
||||
* The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible
|
||||
* threads will wait even while we drop the lock.
|
||||
*/
|
||||
ddt_exit(ddt);
|
||||
|
||||
/*
|
||||
* If there is a log, we should try to "load" from there first.
|
||||
*/
|
||||
if (ddt->ddt_flags & DDT_FLAG_LOG) {
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
boolean_t from_flushing;
|
||||
|
||||
if (ddt_log_find_key(ddt, &search, &ddlwe)) {
|
||||
/*
|
||||
* See if we have the key first, and if so, set up
|
||||
* the entry.
|
||||
*/
|
||||
/* Read-only search, no locks needed (logs stable during I/O) */
|
||||
if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) {
|
||||
dde->dde_type = ddlwe.ddlwe_type;
|
||||
dde->dde_class = ddlwe.ddlwe_class;
|
||||
memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
|
||||
DDT_PHYS_SIZE(ddt));
|
||||
/* Whatever we found isn't valid for this BP, eject */
|
||||
if (verify &&
|
||||
!ddt_entry_lookup_is_valid(ddt, bp, dde)) {
|
||||
|
||||
/*
|
||||
* Check validity. If invalid and no waiters, clean up
|
||||
* immediately. Otherwise continue setup for waiters.
|
||||
*/
|
||||
boolean_t valid = !verify ||
|
||||
ddt_entry_lookup_is_valid(ddt, bp, dde);
|
||||
ddt_enter(ddt);
|
||||
if (!valid && dde->dde_waiters == 0) {
|
||||
avl_remove(&ddt->ddt_tree, dde);
|
||||
ddt_free(ddt, dde);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/* Remove it and count it */
|
||||
if (ddt_log_remove_key(ddt,
|
||||
ddt->ddt_log_active, &search)) {
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
|
||||
} else {
|
||||
VERIFY(ddt_log_remove_key(ddt,
|
||||
ddt->ddt_log_flushing, &search));
|
||||
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
|
||||
if (from_flushing) {
|
||||
dde->dde_flags |= DDE_FLAG_FROM_FLUSHING;
|
||||
DDT_KSTAT_BUMP(ddt,
|
||||
dds_lookup_log_flushing_hit);
|
||||
} else {
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
|
||||
}
|
||||
|
||||
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
|
||||
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
|
||||
|
||||
return (dde);
|
||||
cv_broadcast(&dde->dde_cv);
|
||||
|
||||
return (valid ? dde : NULL);
|
||||
}
|
||||
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
|
||||
}
|
||||
|
||||
/*
|
||||
* ddt_tree is now stable, so unlock and let everyone else keep moving.
|
||||
* Anyone landing on this entry will find it without DDE_FLAG_LOADED,
|
||||
* and go to sleep waiting for it above.
|
||||
*/
|
||||
ddt_exit(ddt);
|
||||
|
||||
/* Search all store objects for the entry. */
|
||||
error = ENOENT;
|
||||
for (type = 0; type < DDT_TYPES; type++) {
|
||||
@@ -2354,6 +2345,19 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
|
||||
avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
|
||||
ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
|
||||
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
|
||||
|
||||
/* If from flushing log, remove it. */
|
||||
if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) {
|
||||
VERIFY(ddt_log_remove_key(ddt,
|
||||
ddt->ddt_log_flushing, &ddlwe.ddlwe_key));
|
||||
}
|
||||
|
||||
/* Update class_start to track last modification time */
|
||||
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
|
||||
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
|
||||
ddt_class_start();
|
||||
}
|
||||
|
||||
ddt_log_entry(ddt, &ddlwe, &dlu);
|
||||
ddt_sync_scan_entry(ddt, &ddlwe, tx);
|
||||
ddt_free(ddt, dde);
|
||||
@@ -2414,6 +2418,13 @@ ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
|
||||
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
|
||||
|
||||
/* Update class_start to track last modification time */
|
||||
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
|
||||
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
|
||||
ddt_class_start();
|
||||
}
|
||||
|
||||
ddt_sync_flush_entry(ddt, &ddlwe,
|
||||
dde->dde_type, dde->dde_class, tx);
|
||||
ddt_sync_scan_entry(ddt, &ddlwe, tx);
|
||||
@@ -2765,7 +2776,7 @@ ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
|
||||
* If this entry is on the log, then the stored entry is stale
|
||||
* and we should skip it.
|
||||
*/
|
||||
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
|
||||
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL))
|
||||
continue;
|
||||
|
||||
/* prune older entries */
|
||||
|
||||
+22
-10
@@ -252,7 +252,8 @@ ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
|
||||
}
|
||||
|
||||
static void
|
||||
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
|
||||
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe,
|
||||
boolean_t hist)
|
||||
{
|
||||
/* Create the log tree entry from a live or stored entry */
|
||||
avl_index_t where;
|
||||
@@ -262,7 +263,13 @@ ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
|
||||
ddle = ddt_log_alloc_entry(ddt);
|
||||
ddle->ddle_key = ddlwe->ddlwe_key;
|
||||
avl_insert(&ddl->ddl_tree, ddle, where);
|
||||
} else if (hist) {
|
||||
ddt_lightweight_entry_t oddlwe;
|
||||
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe);
|
||||
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe);
|
||||
}
|
||||
if (hist)
|
||||
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
|
||||
ddle->ddle_type = ddlwe->ddlwe_type;
|
||||
ddle->ddle_class = ddlwe->ddlwe_class;
|
||||
memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
|
||||
@@ -273,8 +280,7 @@ ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
|
||||
{
|
||||
ASSERT3U(dlu->dlu_dbp, !=, NULL);
|
||||
|
||||
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
|
||||
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
|
||||
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE);
|
||||
|
||||
/* Get our block */
|
||||
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
|
||||
@@ -381,14 +387,20 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
|
||||
|
||||
boolean_t
|
||||
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
|
||||
ddt_lightweight_entry_t *ddlwe)
|
||||
ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing)
|
||||
{
|
||||
ddt_log_entry_t *ddle =
|
||||
avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
|
||||
if (!ddle)
|
||||
ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree,
|
||||
ddk, NULL);
|
||||
if (ddle) {
|
||||
if (from_flushing)
|
||||
*from_flushing = B_FALSE;
|
||||
} else {
|
||||
ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
|
||||
if (!ddle)
|
||||
return (B_FALSE);
|
||||
if (!ddle)
|
||||
return (B_FALSE);
|
||||
if (from_flushing)
|
||||
*from_flushing = B_TRUE;
|
||||
}
|
||||
if (ddlwe)
|
||||
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
|
||||
return (B_TRUE);
|
||||
@@ -524,7 +536,7 @@ ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
|
||||
ddlwe.ddlwe_key = dlre->dlre_key;
|
||||
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
|
||||
|
||||
ddt_log_update_entry(ddt, ddl, &ddlwe);
|
||||
ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
+7
-5
@@ -4067,19 +4067,21 @@ piggyback:
|
||||
|
||||
/*
|
||||
* We need to write. We will create a new write with the copies
|
||||
* property adjusted to match the number of DVAs we need to need to
|
||||
* grow the DDT entry by to satisfy the request.
|
||||
* property adjusted to match the number of DVAs we need to grow
|
||||
* the DDT entry by to satisfy the request.
|
||||
*/
|
||||
zio_prop_t czp = *zp;
|
||||
zio_prop_t czp;
|
||||
if (have_dvas > 0 || parent_dvas > 0) {
|
||||
czp = *zp;
|
||||
czp.zp_copies = need_dvas;
|
||||
czp.zp_gang_copies = 0;
|
||||
zp = &czp;
|
||||
} else {
|
||||
ASSERT3U(czp.zp_copies, ==, need_dvas);
|
||||
ASSERT3U(zp->zp_copies, ==, need_dvas);
|
||||
}
|
||||
|
||||
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||
zio->io_orig_size, zio->io_orig_size, &czp,
|
||||
zio->io_orig_size, zio->io_orig_size, zp,
|
||||
zio_ddt_child_write_ready, NULL,
|
||||
zio_ddt_child_write_done, dde, zio->io_priority,
|
||||
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
||||
|
||||
Reference in New Issue
Block a user