Make ganging redundancy respect redundant_metadata property (#17073)

The redundant_metadata setting in ZFS allows users to trade resilience
for performance and space savings. This applies to all data and metadata
blocks in ZFS, with one exception: gang blocks. Gang blocks currently
just take the copies property of the IO being ganged and, if it's 1,
set it to 2. This means that we always make at least two copies of a
gang header, which is good for resilience. However, if the users care
more about performance than resilience, their gang blocks will be even
more of a penalty than usual.

We add logic to calculate the number of gang header copies directly,
and store it as a separate IO property. It is stored in the IO
properties, rather than calculated when we decide to gang, because by
that point we may not have easy access to the relevant information about
what kind of block is being stored. We also check the redundant_metadata
property when doing so, and use that to decide whether to store an extra
copy of the gang headers, compared to the underlying blocks.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.

Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Co-authored-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
Paul Dagnelie
2025-03-19 15:58:29 -07:00
committed by Alexander Motin
parent 90790955a6
commit a46ce73ca8
15 changed files with 327 additions and 20 deletions
+2
View File
@@ -7065,6 +7065,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
localprop.zp_nopwrite = B_FALSE;
localprop.zp_copies =
MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
localprop.zp_gang_copies =
MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
}
zio_flags |= ZIO_FLAG_RAW;
} else if (ARC_BUF_COMPRESSED(buf)) {
+2 -2
View File
@@ -5364,8 +5364,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
dr->dt.dl.dr_brtwrite);
dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
} else if (data == NULL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+20 -1
View File
@@ -1916,6 +1916,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;
/*
* Old style holes are filled with all zeros, whereas
@@ -2322,6 +2323,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t dedup_verify = os->os_dedup_verify;
boolean_t encrypt = B_FALSE;
int copies = os->os_copies;
int gang_copies = os->os_copies;
/*
* We maintain different write policies for each of the following
@@ -2354,15 +2356,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
switch (os->os_redundant_metadata) {
case ZFS_REDUNDANT_METADATA_ALL:
copies++;
gang_copies++;
break;
case ZFS_REDUNDANT_METADATA_MOST:
if (level >= zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
copies++;
if (level + 1 >=
zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
gang_copies++;
break;
case ZFS_REDUNDANT_METADATA_SOME:
if (DMU_OT_IS_CRITICAL(type))
if (DMU_OT_IS_CRITICAL(type)) {
copies++;
gang_copies++;
} else if (DMU_OT_IS_METADATA(type)) {
gang_copies++;
}
break;
case ZFS_REDUNDANT_METADATA_NONE:
break;
@@ -2445,6 +2456,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
(os->os_redundant_metadata ==
ZFS_REDUNDANT_METADATA_MOST &&
zfs_redundant_metadata_most_ditto_level <= 1))
gang_copies++;
}
/*
@@ -2461,6 +2478,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (DMU_OT_IS_ENCRYPTED(type)) {
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
nopwrite = B_FALSE;
} else {
dedup = B_FALSE;
@@ -2478,6 +2496,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
+3
View File
@@ -2310,6 +2310,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
zp.zp_nopwrite = B_FALSE;
zp.zp_copies = MIN(zp.zp_copies,
SPA_DVAS_PER_BP - 1);
zp.zp_gang_copies =
MIN(zp.zp_gang_copies,
SPA_DVAS_PER_BP - 1);
}
zio_flags |= ZIO_FLAG_RAW;
} else if (DRR_WRITE_COMPRESSED(drrw)) {
+12 -11
View File
@@ -1415,8 +1415,8 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
}
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
boolean_t brtwrite)
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
boolean_t nopwrite, boolean_t brtwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
@@ -1433,6 +1433,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
zio->io_prop.zp_nopwrite = nopwrite;
zio->io_prop.zp_brtwrite = brtwrite;
zio->io_prop.zp_copies = copies;
zio->io_prop.zp_gang_copies = gang_copies;
zio->io_bp_override = bp;
}
@@ -3144,15 +3145,13 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
/*
* If one copy was requested, store 2 copies of the GBH, so that we
* can still traverse all the data (e.g. to free or scrub) even if a
* block is damaged. Note that we can't store 3 copies of the GBH in
* all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
* Store multiple copies of the GBH, so that we can still traverse
* all the data (e.g. to free or scrub) even if a block is damaged.
* This value respects the redundant_metadata property.
*/
int gbh_copies = copies;
if (gbh_copies == 1) {
gbh_copies = MIN(2, spa_max_replication(spa));
}
int gbh_copies = gio->io_prop.zp_gang_copies;
ASSERT3S(gbh_copies, >, 0);
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
ASSERT(ZIO_HAS_ALLOCATOR(pio));
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
@@ -3172,6 +3171,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* since metaslab_class_throttle_reserve() always allows
* additional reservations for gang blocks.
*/
ASSERT3U(gbh_copies, >=, copies);
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
pio->io_allocator, pio, flags));
}
@@ -3234,6 +3234,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
zp.zp_dedup = B_FALSE;
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
@@ -3954,7 +3955,7 @@ zio_ddt_write(zio_t *zio)
* grow the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
czp.zp_copies = need_dvas;
czp.zp_copies = czp.zp_gang_copies = need_dvas;
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio_ddt_child_write_ready, NULL,