mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-03-10 12:26:27 +03:00
DDT: Move logs searches out of the lock
Postponing entry removal from the DDT log on a hit until the later single-threaded sync stage keeps ddl_tree stable during the multi-threaded ZIO processing stage. This allows dropping the DDT lock before the search instead of after it, which reduces contention considerably. ddt_log_update_entry() was already handling the case of an entry present in the active log, so we only need to remove the entry from the flushing log if it happens to be there. My tests with parallel 4KB block writes show a throughput increase from 480MB/s (122K blocks/s) to 827MB/s (212K blocks/s), even though throughput is still limited by global DDT lock contention. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com> Closes #18044
This commit is contained in:
parent
3d76ba2737
commit
46d6f1fe56
@ -213,6 +213,7 @@ typedef enum {
|
||||
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
|
||||
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */
|
||||
#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */
|
||||
#define DDE_FLAG_FROM_FLUSHING (1 << 3) /* loaded from flushing log */
|
||||
|
||||
/*
|
||||
* Additional data to support entry update or repair. This is fixed size
|
||||
@ -280,13 +281,14 @@ typedef struct {
|
||||
*/
|
||||
typedef struct {
|
||||
kmutex_t ddt_lock; /* protects changes to all fields */
|
||||
|
||||
avl_tree_t ddt_tree; /* "live" (changed) entries this txg */
|
||||
avl_tree_t ddt_log_tree; /* logged entries */
|
||||
|
||||
avl_tree_t ddt_repair_tree; /* entries being repaired */
|
||||
|
||||
ddt_log_t ddt_log[2]; /* active/flushing logs */
|
||||
/*
|
||||
* Log trees are stable during I/O, and only modified during sync
|
||||
* with exclusive access.
|
||||
*/
|
||||
ddt_log_t ddt_log[2] ____cacheline_aligned; /* logged entries */
|
||||
ddt_log_t *ddt_log_active; /* pointers into ddt_log */
|
||||
ddt_log_t *ddt_log_flushing; /* swapped when flush starts */
|
||||
|
||||
|
||||
@ -69,8 +69,8 @@ extern "C" {
|
||||
* the live tree.
|
||||
*/
|
||||
typedef struct {
|
||||
ddt_key_t ddle_key; /* ddt_log_tree key */
|
||||
avl_node_t ddle_node; /* ddt_log_tree node */
|
||||
ddt_key_t ddle_key; /* ddl_tree key */
|
||||
avl_node_t ddle_node; /* ddl_tree node */
|
||||
|
||||
ddt_type_t ddle_type; /* storage type */
|
||||
ddt_class_t ddle_class; /* storage class */
|
||||
@ -193,7 +193,7 @@ extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
|
||||
ddt_lightweight_entry_t *ddlwe);
|
||||
|
||||
extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
|
||||
ddt_lightweight_entry_t *ddlwe);
|
||||
ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing);
|
||||
extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
|
||||
const ddt_key_t *ddk);
|
||||
|
||||
|
||||
@ -1037,13 +1037,6 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
|
||||
|
||||
/* Entry is still in the log, so charge the entry back to it */
|
||||
if (dde->dde_flags & DDE_FLAG_LOGGED) {
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
|
||||
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
|
||||
}
|
||||
|
||||
avl_remove(&ddt->ddt_tree, dde);
|
||||
ddt_free(ddt, dde);
|
||||
}
|
||||
@ -1234,63 +1227,61 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify)
|
||||
|
||||
/* Time to make a new entry. */
|
||||
dde = ddt_alloc(ddt, &search);
|
||||
|
||||
/* Record the time this class was created (used by ddt prune) */
|
||||
if (ddt->ddt_flags & DDT_FLAG_FLAT)
|
||||
dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start();
|
||||
|
||||
avl_insert(&ddt->ddt_tree, dde, where);
|
||||
|
||||
/* If its in the log tree, we can "load" it from there */
|
||||
/*
|
||||
* The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible
|
||||
* threads will wait even while we drop the lock.
|
||||
*/
|
||||
ddt_exit(ddt);
|
||||
|
||||
/*
|
||||
* If there is a log, we should try to "load" from there first.
|
||||
*/
|
||||
if (ddt->ddt_flags & DDT_FLAG_LOG) {
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
boolean_t from_flushing;
|
||||
|
||||
if (ddt_log_find_key(ddt, &search, &ddlwe)) {
|
||||
/*
|
||||
* See if we have the key first, and if so, set up
|
||||
* the entry.
|
||||
*/
|
||||
/* Read-only search, no locks needed (logs stable during I/O) */
|
||||
if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) {
|
||||
dde->dde_type = ddlwe.ddlwe_type;
|
||||
dde->dde_class = ddlwe.ddlwe_class;
|
||||
memcpy(dde->dde_phys, &ddlwe.ddlwe_phys,
|
||||
DDT_PHYS_SIZE(ddt));
|
||||
/* Whatever we found isn't valid for this BP, eject */
|
||||
if (verify &&
|
||||
!ddt_entry_lookup_is_valid(ddt, bp, dde)) {
|
||||
|
||||
/*
|
||||
* Check validity. If invalid and no waiters, clean up
|
||||
* immediately. Otherwise continue setup for waiters.
|
||||
*/
|
||||
boolean_t valid = !verify ||
|
||||
ddt_entry_lookup_is_valid(ddt, bp, dde);
|
||||
ddt_enter(ddt);
|
||||
if (!valid && dde->dde_waiters == 0) {
|
||||
avl_remove(&ddt->ddt_tree, dde);
|
||||
ddt_free(ddt, dde);
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
/* Remove it and count it */
|
||||
if (ddt_log_remove_key(ddt,
|
||||
ddt->ddt_log_active, &search)) {
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
|
||||
} else {
|
||||
VERIFY(ddt_log_remove_key(ddt,
|
||||
ddt->ddt_log_flushing, &search));
|
||||
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
|
||||
if (from_flushing) {
|
||||
dde->dde_flags |= DDE_FLAG_FROM_FLUSHING;
|
||||
DDT_KSTAT_BUMP(ddt,
|
||||
dds_lookup_log_flushing_hit);
|
||||
} else {
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
|
||||
}
|
||||
|
||||
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;
|
||||
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
|
||||
|
||||
return (dde);
|
||||
cv_broadcast(&dde->dde_cv);
|
||||
|
||||
return (valid ? dde : NULL);
|
||||
}
|
||||
|
||||
DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
|
||||
}
|
||||
|
||||
/*
|
||||
* ddt_tree is now stable, so unlock and let everyone else keep moving.
|
||||
* Anyone landing on this entry will find it without DDE_FLAG_LOADED,
|
||||
* and go to sleep waiting for it above.
|
||||
*/
|
||||
ddt_exit(ddt);
|
||||
|
||||
/* Search all store objects for the entry. */
|
||||
error = ENOENT;
|
||||
for (type = 0; type < DDT_TYPES; type++) {
|
||||
@ -2354,6 +2345,19 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
|
||||
avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
|
||||
ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
|
||||
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
|
||||
|
||||
/* If from flushing log, remove it. */
|
||||
if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) {
|
||||
VERIFY(ddt_log_remove_key(ddt,
|
||||
ddt->ddt_log_flushing, &ddlwe.ddlwe_key));
|
||||
}
|
||||
|
||||
/* Update class_start to track last modification time */
|
||||
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
|
||||
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
|
||||
ddt_class_start();
|
||||
}
|
||||
|
||||
ddt_log_entry(ddt, &ddlwe, &dlu);
|
||||
ddt_sync_scan_entry(ddt, &ddlwe, tx);
|
||||
ddt_free(ddt, dde);
|
||||
@ -2414,6 +2418,13 @@ ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx)
|
||||
|
||||
ddt_lightweight_entry_t ddlwe;
|
||||
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
|
||||
|
||||
/* Update class_start to track last modification time */
|
||||
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
|
||||
ddlwe.ddlwe_phys.ddp_flat.ddp_class_start =
|
||||
ddt_class_start();
|
||||
}
|
||||
|
||||
ddt_sync_flush_entry(ddt, &ddlwe,
|
||||
dde->dde_type, dde->dde_class, tx);
|
||||
ddt_sync_scan_entry(ddt, &ddlwe, tx);
|
||||
@ -2765,7 +2776,7 @@ ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram)
|
||||
* If this entry is on the log, then the stored entry is stale
|
||||
* and we should skip it.
|
||||
*/
|
||||
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL))
|
||||
if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL))
|
||||
continue;
|
||||
|
||||
/* prune older entries */
|
||||
|
||||
@ -252,7 +252,8 @@ ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
|
||||
}
|
||||
|
||||
static void
|
||||
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
|
||||
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe,
|
||||
boolean_t hist)
|
||||
{
|
||||
/* Create the log tree entry from a live or stored entry */
|
||||
avl_index_t where;
|
||||
@ -262,7 +263,13 @@ ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
|
||||
ddle = ddt_log_alloc_entry(ddt);
|
||||
ddle->ddle_key = ddlwe->ddlwe_key;
|
||||
avl_insert(&ddl->ddl_tree, ddle, where);
|
||||
} else if (hist) {
|
||||
ddt_lightweight_entry_t oddlwe;
|
||||
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe);
|
||||
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe);
|
||||
}
|
||||
if (hist)
|
||||
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
|
||||
ddle->ddle_type = ddlwe->ddlwe_type;
|
||||
ddle->ddle_class = ddlwe->ddlwe_class;
|
||||
memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
|
||||
@ -273,8 +280,7 @@ ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
|
||||
{
|
||||
ASSERT3U(dlu->dlu_dbp, !=, NULL);
|
||||
|
||||
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
|
||||
ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
|
||||
ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE);
|
||||
|
||||
/* Get our block */
|
||||
ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
|
||||
@ -381,14 +387,20 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
|
||||
|
||||
boolean_t
|
||||
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
|
||||
ddt_lightweight_entry_t *ddlwe)
|
||||
ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing)
|
||||
{
|
||||
ddt_log_entry_t *ddle =
|
||||
avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
|
||||
if (!ddle)
|
||||
ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree,
|
||||
ddk, NULL);
|
||||
if (ddle) {
|
||||
if (from_flushing)
|
||||
*from_flushing = B_FALSE;
|
||||
} else {
|
||||
ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
|
||||
if (!ddle)
|
||||
return (B_FALSE);
|
||||
if (!ddle)
|
||||
return (B_FALSE);
|
||||
if (from_flushing)
|
||||
*from_flushing = B_TRUE;
|
||||
}
|
||||
if (ddlwe)
|
||||
DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
|
||||
return (B_TRUE);
|
||||
@ -524,7 +536,7 @@ ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
|
||||
ddlwe.ddlwe_key = dlre->dlre_key;
|
||||
memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));
|
||||
|
||||
ddt_log_update_entry(ddt, ddl, &ddlwe);
|
||||
ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@ -4067,19 +4067,21 @@ piggyback:
|
||||
|
||||
/*
|
||||
* We need to write. We will create a new write with the copies
|
||||
* property adjusted to match the number of DVAs we need to need to
|
||||
* grow the DDT entry by to satisfy the request.
|
||||
* property adjusted to match the number of DVAs we need to grow
|
||||
* the DDT entry by to satisfy the request.
|
||||
*/
|
||||
zio_prop_t czp = *zp;
|
||||
zio_prop_t czp;
|
||||
if (have_dvas > 0 || parent_dvas > 0) {
|
||||
czp = *zp;
|
||||
czp.zp_copies = need_dvas;
|
||||
czp.zp_gang_copies = 0;
|
||||
zp = &czp;
|
||||
} else {
|
||||
ASSERT3U(czp.zp_copies, ==, need_dvas);
|
||||
ASSERT3U(zp->zp_copies, ==, need_dvas);
|
||||
}
|
||||
|
||||
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||
zio->io_orig_size, zio->io_orig_size, &czp,
|
||||
zio->io_orig_size, zio->io_orig_size, zp,
|
||||
zio_ddt_child_write_ready, NULL,
|
||||
zio_ddt_child_write_done, dde, zio->io_priority,
|
||||
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user