mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
Make ganging redundancy respect redundant_metadata property (#17073)
The redundant_metadata setting in ZFS allows users to trade resilience for performance and space savings. This applies to all data and metadata blocks in ZFS, with one exception: gang blocks. Gang blocks currently just take the copies property of the IO being ganged and, if it's 1, set it to 2. This means that we always make at least two copies of a gang header, which is good for resilience. However, if users care more about performance than resilience, their gang blocks will be even more of a penalty than usual. We add logic to calculate the number of gang header copies directly, and store it as a separate IO property. It is stored in the IO properties, rather than calculated at the point where we decide to gang, because by that point we may not have easy access to the relevant information about what kind of block is being stored. We also check the redundant_metadata property when doing so, and use that to decide whether to store an extra copy of the gang headers, compared to the underlying blocks. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Co-authored-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
committed by
Alexander Motin
parent
90790955a6
commit
a46ce73ca8
@@ -7065,6 +7065,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
|
||||
localprop.zp_nopwrite = B_FALSE;
|
||||
localprop.zp_copies =
|
||||
MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
|
||||
localprop.zp_gang_copies =
|
||||
MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
|
||||
}
|
||||
zio_flags |= ZIO_FLAG_RAW;
|
||||
} else if (ARC_BUF_COMPRESSED(buf)) {
|
||||
|
||||
+2
-2
@@ -5364,8 +5364,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
||||
mutex_enter(&db->db_mtx);
|
||||
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
||||
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
|
||||
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
|
||||
dr->dt.dl.dr_brtwrite);
|
||||
dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
|
||||
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
|
||||
mutex_exit(&db->db_mtx);
|
||||
} else if (data == NULL) {
|
||||
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
|
||||
|
||||
+20
-1
@@ -1916,6 +1916,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
|
||||
dr->dt.dl.dr_overridden_by = *zio->io_bp;
|
||||
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
|
||||
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
|
||||
dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;
|
||||
|
||||
/*
|
||||
* Old style holes are filled with all zeros, whereas
|
||||
@@ -2322,6 +2323,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
boolean_t dedup_verify = os->os_dedup_verify;
|
||||
boolean_t encrypt = B_FALSE;
|
||||
int copies = os->os_copies;
|
||||
int gang_copies = os->os_copies;
|
||||
|
||||
/*
|
||||
* We maintain different write policies for each of the following
|
||||
@@ -2354,15 +2356,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
switch (os->os_redundant_metadata) {
|
||||
case ZFS_REDUNDANT_METADATA_ALL:
|
||||
copies++;
|
||||
gang_copies++;
|
||||
break;
|
||||
case ZFS_REDUNDANT_METADATA_MOST:
|
||||
if (level >= zfs_redundant_metadata_most_ditto_level ||
|
||||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
|
||||
copies++;
|
||||
if (level + 1 >=
|
||||
zfs_redundant_metadata_most_ditto_level ||
|
||||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
|
||||
gang_copies++;
|
||||
break;
|
||||
case ZFS_REDUNDANT_METADATA_SOME:
|
||||
if (DMU_OT_IS_CRITICAL(type))
|
||||
if (DMU_OT_IS_CRITICAL(type)) {
|
||||
copies++;
|
||||
gang_copies++;
|
||||
} else if (DMU_OT_IS_METADATA(type)) {
|
||||
gang_copies++;
|
||||
}
|
||||
break;
|
||||
case ZFS_REDUNDANT_METADATA_NONE:
|
||||
break;
|
||||
@@ -2445,6 +2456,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
|
||||
ZCHECKSUM_FLAG_NOPWRITE) &&
|
||||
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
|
||||
|
||||
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
|
||||
(os->os_redundant_metadata ==
|
||||
ZFS_REDUNDANT_METADATA_MOST &&
|
||||
zfs_redundant_metadata_most_ditto_level <= 1))
|
||||
gang_copies++;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2461,6 +2478,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
|
||||
if (DMU_OT_IS_ENCRYPTED(type)) {
|
||||
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
|
||||
gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
|
||||
nopwrite = B_FALSE;
|
||||
} else {
|
||||
dedup = B_FALSE;
|
||||
@@ -2478,6 +2496,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
||||
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
|
||||
zp->zp_level = level;
|
||||
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
|
||||
zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
|
||||
zp->zp_dedup = dedup;
|
||||
zp->zp_dedup_verify = dedup && dedup_verify;
|
||||
zp->zp_nopwrite = nopwrite;
|
||||
|
||||
@@ -2310,6 +2310,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
|
||||
zp.zp_nopwrite = B_FALSE;
|
||||
zp.zp_copies = MIN(zp.zp_copies,
|
||||
SPA_DVAS_PER_BP - 1);
|
||||
zp.zp_gang_copies =
|
||||
MIN(zp.zp_gang_copies,
|
||||
SPA_DVAS_PER_BP - 1);
|
||||
}
|
||||
zio_flags |= ZIO_FLAG_RAW;
|
||||
} else if (DRR_WRITE_COMPRESSED(drrw)) {
|
||||
|
||||
+12
-11
@@ -1415,8 +1415,8 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
|
||||
}
|
||||
|
||||
void
|
||||
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
|
||||
boolean_t brtwrite)
|
||||
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
|
||||
boolean_t nopwrite, boolean_t brtwrite)
|
||||
{
|
||||
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
|
||||
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
||||
@@ -1433,6 +1433,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
|
||||
zio->io_prop.zp_nopwrite = nopwrite;
|
||||
zio->io_prop.zp_brtwrite = brtwrite;
|
||||
zio->io_prop.zp_copies = copies;
|
||||
zio->io_prop.zp_gang_copies = gang_copies;
|
||||
zio->io_bp_override = bp;
|
||||
}
|
||||
|
||||
@@ -3144,15 +3145,13 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
|
||||
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
|
||||
|
||||
/*
|
||||
* If one copy was requested, store 2 copies of the GBH, so that we
|
||||
* can still traverse all the data (e.g. to free or scrub) even if a
|
||||
* block is damaged. Note that we can't store 3 copies of the GBH in
|
||||
* all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
|
||||
* Store multiple copies of the GBH, so that we can still traverse
|
||||
* all the data (e.g. to free or scrub) even if a block is damaged.
|
||||
* This value respects the redundant_metadata property.
|
||||
*/
|
||||
int gbh_copies = copies;
|
||||
if (gbh_copies == 1) {
|
||||
gbh_copies = MIN(2, spa_max_replication(spa));
|
||||
}
|
||||
int gbh_copies = gio->io_prop.zp_gang_copies;
|
||||
ASSERT3S(gbh_copies, >, 0);
|
||||
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
|
||||
|
||||
ASSERT(ZIO_HAS_ALLOCATOR(pio));
|
||||
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
|
||||
@@ -3172,6 +3171,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
|
||||
* since metaslab_class_throttle_reserve() always allows
|
||||
* additional reservations for gang blocks.
|
||||
*/
|
||||
ASSERT3U(gbh_copies, >=, copies);
|
||||
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
|
||||
pio->io_allocator, pio, flags));
|
||||
}
|
||||
@@ -3234,6 +3234,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
|
||||
zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
|
||||
zp.zp_level = 0;
|
||||
zp.zp_copies = gio->io_prop.zp_copies;
|
||||
zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
|
||||
zp.zp_dedup = B_FALSE;
|
||||
zp.zp_dedup_verify = B_FALSE;
|
||||
zp.zp_nopwrite = B_FALSE;
|
||||
@@ -3954,7 +3955,7 @@ zio_ddt_write(zio_t *zio)
|
||||
* grow the DDT entry by to satisfy the request.
|
||||
*/
|
||||
zio_prop_t czp = *zp;
|
||||
czp.zp_copies = need_dvas;
|
||||
czp.zp_copies = czp.zp_gang_copies = need_dvas;
|
||||
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
|
||||
zio->io_orig_size, zio->io_orig_size, &czp,
|
||||
zio_ddt_child_write_ready, NULL,
|
||||
|
||||
Reference in New Issue
Block a user