diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 75f021636..77a233a6f 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -213,6 +213,7 @@ typedef enum { #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ #define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ +#define DDE_FLAG_FROM_FLUSHING (1 << 3) /* loaded from flushing log */ /* * Additional data to support entry update or repair. This is fixed size @@ -280,13 +281,14 @@ typedef struct { */ typedef struct { kmutex_t ddt_lock; /* protects changes to all fields */ - avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ - avl_tree_t ddt_log_tree; /* logged entries */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ - ddt_log_t ddt_log[2]; /* active/flushing logs */ + /* + * Log trees are stable during I/O, and only modified during sync + * with exclusive access. + */ + ddt_log_t ddt_log[2] ____cacheline_aligned; /* logged entries */ ddt_log_t *ddt_log_active; /* pointers into ddt_log */ ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 22bd1b2ad..e33b63358 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -69,8 +69,8 @@ extern "C" { * the live tree. */ typedef struct { - ddt_key_t ddle_key; /* ddt_log_tree key */ - avl_node_t ddle_node; /* ddt_log_tree node */ + ddt_key_t ddle_key; /* ddl_tree key */ + avl_node_t ddle_node; /* ddl_tree node */ ddt_type_t ddle_type; /* storage type */ ddt_class_t ddle_class; /* storage class */ @@ -193,7 +193,7 @@ extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe); extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, - ddt_lightweight_entry_t *ddlwe); + ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing); extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 42d399289..2c7e2c550 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1037,13 +1037,6 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - /* Entry is still in the log, so charge the entry back to it */ - if (dde->dde_flags & DDE_FLAG_LOGGED) { - ddt_lightweight_entry_t ddlwe; - DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); - ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); - } - avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } @@ -1234,63 +1227,61 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify) /* Time to make a new entry. */ dde = ddt_alloc(ddt, &search); - - /* Record the time this class was created (used by ddt prune) */ - if (ddt->ddt_flags & DDT_FLAG_FLAT) - dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start(); - avl_insert(&ddt->ddt_tree, dde, where); - /* If its in the log tree, we can "load" it from there */ + /* + * The entry in ddt_tree has no DDE_FLAG_LOADED, so other possible + * threads will wait even while we drop the lock. + */ + ddt_exit(ddt); + + /* + * If there is a log, we should try to "load" from there first. + */ if (ddt->ddt_flags & DDT_FLAG_LOG) { ddt_lightweight_entry_t ddlwe; + boolean_t from_flushing; - if (ddt_log_find_key(ddt, &search, &ddlwe)) { - /* - * See if we have the key first, and if so, set up - * the entry. - */ + /* Read-only search, no locks needed (logs stable during I/O) */ + if (ddt_log_find_key(ddt, &search, &ddlwe, &from_flushing)) { dde->dde_type = ddlwe.ddlwe_type; dde->dde_class = ddlwe.ddlwe_class; memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, DDT_PHYS_SIZE(ddt)); - /* Whatever we found isn't valid for this BP, eject */ - if (verify && - !ddt_entry_lookup_is_valid(ddt, bp, dde)) { + + /* + * Check validity. If invalid and no waiters, clean up + * immediately. Otherwise continue setup for waiters. + */ + boolean_t valid = !verify || + ddt_entry_lookup_is_valid(ddt, bp, dde); + ddt_enter(ddt); + if (!valid && dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); return (NULL); } - /* Remove it and count it */ - if (ddt_log_remove_key(ddt, - ddt->ddt_log_active, &search)) { - DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); - } else { - VERIFY(ddt_log_remove_key(ddt, - ddt->ddt_log_flushing, &search)); + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + if (from_flushing) { + dde->dde_flags |= DDE_FLAG_FROM_FLUSHING; DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit); + } else { + DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); } - dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; - DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); - return (dde); + cv_broadcast(&dde->dde_cv); + + return (valid ? dde : NULL); } DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss); } - /* - * ddt_tree is now stable, so unlock and let everyone else keep moving. - * Anyone landing on this entry will find it without DDE_FLAG_LOADED, - * and go to sleep waiting for it above. - */ - ddt_exit(ddt); - /* Search all store objects for the entry. */ error = ENOENT; for (type = 0; type < DDT_TYPES; type++) { @@ -2354,6 +2345,19 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ASSERT(dde->dde_flags & DDE_FLAG_LOADED); DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + + /* If from flushing log, remove it. */ + if (dde->dde_flags & DDE_FLAG_FROM_FLUSHING) { + VERIFY(ddt_log_remove_key(ddt, + ddt->ddt_log_flushing, &ddlwe.ddlwe_key)); + } + + /* Update class_start to track last modification time */ + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + ddlwe.ddlwe_phys.ddp_flat.ddp_class_start = + ddt_class_start(); + } + ddt_log_entry(ddt, &ddlwe, &dlu); ddt_sync_scan_entry(ddt, &ddlwe, tx); ddt_free(ddt, dde); @@ -2414,6 +2418,13 @@ ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + + /* Update class_start to track last modification time */ + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + ddlwe.ddlwe_phys.ddp_flat.ddp_class_start = + ddt_class_start(); + } + ddt_sync_flush_entry(ddt, &ddlwe, dde->dde_type, dde->dde_class, tx); ddt_sync_scan_entry(ddt, &ddlwe, tx); @@ -2765,7 +2776,7 @@ ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) * If this entry is on the log, then the stored entry is stale * and we should skip it. */ - if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL)) + if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL, NULL)) continue; /* prune older entries */ diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c index 3d42c5136..4173f8f57 100644 --- a/module/zfs/ddt_log.c +++ b/module/zfs/ddt_log.c @@ -252,7 +252,8 @@ ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle) } static void -ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe, + boolean_t hist) { /* Create the log tree entry from a live or stored entry */ avl_index_t where; @@ -262,7 +263,13 @@ ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) ddle = ddt_log_alloc_entry(ddt); ddle->ddle_key = ddlwe->ddlwe_key; avl_insert(&ddl->ddl_tree, ddle, where); + } else if (hist) { + ddt_lightweight_entry_t oddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe); + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe); } + if (hist) + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); ddle->ddle_type = ddlwe->ddlwe_type; ddle->ddle_class = ddlwe->ddlwe_class; memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); @@ -273,8 +280,7 @@ ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) { ASSERT3U(dlu->dlu_dbp, !=, NULL); - ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); - ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE); /* Get our block */ ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); @@ -381,14 +387,20 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, - ddt_lightweight_entry_t *ddlwe) + ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing) { - ddt_log_entry_t *ddle = - avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL); - if (!ddle) + ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree, + ddk, NULL); + if (ddle) { + if (from_flushing) + *from_flushing = B_FALSE; + } else { ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL); - if (!ddle) - return (B_FALSE); + if (!ddle) + return (B_FALSE); + if (from_flushing) + *from_flushing = B_TRUE; + } if (ddlwe) DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); return (B_TRUE); @@ -524,7 +536,7 @@ ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, ddlwe.ddlwe_key = dlre->dlre_key; memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); - ddt_log_update_entry(ddt, ddl, &ddlwe); + ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE); } static void diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 77383df0e..a48854563 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4067,19 +4067,21 @@ piggyback: /* * We need to write. We will create a new write with the copies - * property adjusted to match the number of DVAs we need to need to - * grow the DDT entry by to satisfy the request. + * property adjusted to match the number of DVAs we need to grow + * the DDT entry by to satisfy the request. */ - zio_prop_t czp = *zp; + zio_prop_t czp; if (have_dvas > 0 || parent_dvas > 0) { + czp = *zp; czp.zp_copies = need_dvas; czp.zp_gang_copies = 0; + zp = &czp; } else { - ASSERT3U(czp.zp_copies, ==, need_dvas); + ASSERT3U(zp->zp_copies, ==, need_dvas); } zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, &czp, + zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);