Add BRT support to zpool prefetch command

Implement BRT (Block Reference Table) prefetch functionality similar
to existing DDT prefetch.  This allows preloading BRT metadata into
ARC to improve performance for block cloning operations and frees
of earlier cloned blocks.

Make -t parameter optional.  When omitted, prefetch all supported
metadata types (both DDT and BRT now).

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <alexander.motin@TrueNAS.com>
Closes #17890
This commit is contained in:
Alexander Motin 2025-11-10 19:16:22 -05:00 committed by Brian Behlendorf
parent 002bc3da6a
commit 41878d57ea
12 changed files with 198 additions and 42 deletions

View File

@ -494,8 +494,7 @@ get_usage(zpool_help_t idx)
"[--json-int, --json-pool-key-guid]] ...\n"
"\t [-T d|u] [pool] [interval [count]]\n"));
case HELP_PREFETCH:
return (gettext("\tprefetch -t <type> [<type opts>] <pool>\n"
"\t -t ddt <pool>\n"));
return (gettext("\tprefetch [-t <type>] <pool>\n"));
case HELP_OFFLINE:
return (gettext("\toffline [--power]|[[-f][-t]] <pool> "
"<device> ...\n"));
@ -4200,7 +4199,7 @@ zpool_do_checkpoint(int argc, char **argv)
#define CHECKPOINT_OPT 1024
/*
* zpool prefetch <type> [<type opts>] <pool>
* zpool prefetch [-t <type>] <pool>
*
Prefetches a particular type of data in the specified pool.
*/
@ -4245,20 +4244,27 @@ zpool_do_prefetch(int argc, char **argv)
poolname = argv[0];
argc--;
argv++;
if (strcmp(typestr, "ddt") == 0) {
type = ZPOOL_PREFETCH_DDT;
} else {
(void) fprintf(stderr, gettext("unsupported prefetch type\n"));
usage(B_FALSE);
}
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
err = zpool_prefetch(zhp, type);
if (typestr == NULL) {
/* Prefetch all types */
err = zpool_prefetch(zhp, ZPOOL_PREFETCH_DDT);
if (err == 0)
err = zpool_prefetch(zhp, ZPOOL_PREFETCH_BRT);
} else {
if (strcmp(typestr, "ddt") == 0) {
type = ZPOOL_PREFETCH_DDT;
} else if (strcmp(typestr, "brt") == 0) {
type = ZPOOL_PREFETCH_BRT;
} else {
(void) fprintf(stderr,
gettext("unsupported prefetch type\n"));
zpool_close(zhp);
usage(B_FALSE);
}
err = zpool_prefetch(zhp, type);
}
zpool_close(zhp);

View File

@ -56,6 +56,7 @@ extern void brt_create(spa_t *spa);
extern int brt_load(spa_t *spa);
extern void brt_unload(spa_t *spa);
extern void brt_sync(spa_t *spa, uint64_t txg);
extern void brt_prefetch_all(spa_t *spa);
#ifdef __cplusplus
}

View File

@ -1713,7 +1713,8 @@ typedef enum {
typedef enum {
ZPOOL_PREFETCH_NONE = 0,
ZPOOL_PREFETCH_DDT
ZPOOL_PREFETCH_DDT,
ZPOOL_PREFETCH_BRT
} zpool_prefetch_type_t;
typedef enum {

View File

@ -1745,9 +1745,13 @@ zpool_prefetch(zpool_handle_t *zhp, zpool_prefetch_type_t type)
error = lzc_pool_prefetch(zhp->zpool_name, type);
if (error != 0) {
const char *typename = "unknown";
if (type == ZPOOL_PREFETCH_DDT)
typename = "ddt";
else if (type == ZPOOL_PREFETCH_BRT)
typename = "brt";
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot prefetch %s in '%s'"),
type == ZPOOL_PREFETCH_DDT ? "ddt" : "", zhp->zpool_name);
"cannot prefetch %s in '%s'"), typename, zhp->zpool_name);
(void) zpool_standard_error(hdl, error, msg);
return (-1);
}

View File

@ -28,20 +28,25 @@
.
.Sh NAME
.Nm zpool-prefetch
.Nd Loads specific types of data for the given pool
.Nd Prefetches pool metadata into ARC
.Sh SYNOPSIS
.Nm zpool
.Cm prefetch
.Fl t Ar type
.Op Fl t Ar type
.Ar pool
.Sh DESCRIPTION
.Bl -tag -width Ds
.It Xo
.Nm zpool
.Cm prefetch
.Fl t Li ddt
.Ar pool
.Xc
Prefetch data of a specific type for the given pool; specifically the DDT,
which will improve write I/O performance when the DDT is resident in the ARC.
Massively prefetch metadata of a specific type for the given pool into the ARC
to reduce latency of some operations later.
If no type is specified, all types are prefetched.
.Pp
The following types are supported:
.Bl -tag -width "brt"
.It Sy brt
Prefetch the BRT (block reference table).
This may improve performance for block cloning operations,
and frees for earlier cloned blocks.
.It Sy ddt
Prefetch the DDT (deduplication table).
This may improve performance of writes when deduplication is enabled,
and frees for earlier deduplicated blocks.
.El

View File

@ -1510,6 +1510,31 @@ brt_load(spa_t *spa)
return (error);
}
void
brt_prefetch_all(spa_t *spa)
{
	/*
	 * Prefetch the BRT entry ZAP object of every vdev into the ARC, so
	 * that later block-cloning operations and frees of cloned blocks
	 * find the BRT metadata already cached.
	 *
	 * Locking: spa_brt_nvdevs / spa_brt_vdevs are only read under the
	 * BRT read lock, but the lock is dropped around the (potentially
	 * slow) zap_prefetch_object() call; each per-vdev ZAP object is
	 * protected by its own bv_mos_entries_lock instead.  This mirrors
	 * the lock-free style of brt_prefetch (called from
	 * brt_pending_add).
	 */
	brt_rlock(spa);
	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
		/* Drop the BRT lock before the blocking prefetch below. */
		brt_unlock(spa);
		rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
		/* A zero object means this vdev has no BRT entries yet. */
		if (brtvd->bv_mos_entries != 0) {
			(void) zap_prefetch_object(spa->spa_meta_objset,
			brtvd->bv_mos_entries);
		}
		rw_exit(&brtvd->bv_mos_entries_lock);
		/* Reacquire before re-reading spa_brt_nvdevs in the loop test. */
		brt_rlock(spa);
	}
	brt_unlock(spa);
}
void
brt_unload(spa_t *spa)
{

View File

@ -850,12 +850,15 @@ dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size)
return (err);
/*
* Chunk the requests (16 indirects worth) so that we can be interrupted
* Chunk the requests (16 indirects worth) so that we can be
* interrupted. Prefetch at least SPA_MAXBLOCKSIZE at a time
* to better utilize pools with smaller block sizes.
*/
uint64_t chunksize;
if (dn->dn_indblkshift) {
uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1);
chunksize = (nbps * 16) << dn->dn_datablkshift;
chunksize = MAX(chunksize, SPA_MAXBLOCKSIZE);
} else {
chunksize = dn->dn_datablksz;
}

View File

@ -212,6 +212,8 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/brt.h>
#include <sys/ddt.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@ -4276,13 +4278,11 @@ zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
spa_t *spa;
int32_t type;
/*
* Currently, only ZPOOL_PREFETCH_DDT is supported
*/
if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0 ||
type != ZPOOL_PREFETCH_DDT) {
if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0)
return (EINVAL);
if (type != ZPOOL_PREFETCH_DDT && type != ZPOOL_PREFETCH_BRT)
return (EINVAL);
}
error = spa_open(poolname, &spa, FTAG);
if (error != 0)
@ -4290,10 +4290,17 @@ zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
hrtime_t start_time = gethrtime();
ddt_prefetch_all(spa);
zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", spa->spa_name,
(u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
if (type == ZPOOL_PREFETCH_DDT) {
ddt_prefetch_all(spa);
zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms",
spa->spa_name,
(u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
} else {
brt_prefetch_all(spa);
zfs_dbgmsg("pool '%s': loaded brt into ARC in %llu ms",
spa->spa_name,
(u_longlong_t)NSEC2MSEC(gethrtime() - start_time));
}
spa_close(spa, FTAG);

View File

@ -215,7 +215,7 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos',
tags = ['functional', 'cli_root', 'zfs_create']
[tests/functional/cli_root/zpool_prefetch]
tests = ['zpool_prefetch_001_pos']
tests = ['zpool_prefetch_001_pos', 'zpool_prefetch_002_pos']
tags = ['functional', 'cli_root', 'zpool_prefetch']
[tests/functional/cli_root/zfs_destroy]

View File

@ -1217,6 +1217,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_prefetch/cleanup.ksh \
functional/cli_root/zpool_prefetch/setup.ksh \
functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \
functional/cli_root/zpool_prefetch/zpool_prefetch_002_pos.ksh \
functional/cli_root/zpool_reguid/cleanup.ksh \
functional/cli_root/zpool_reguid/setup.ksh \
functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh \

View File

@ -42,6 +42,15 @@ verify_runnable "both"
log_assert "'zpool prefetch -t ddt <pool>' can successfully load the DDT for a pool."
DATASET=$TESTPOOL/ddt
function cleanup
{
datasetexists $DATASET && destroy_dataset $DATASET -f
}
log_onexit cleanup
function getddtstats
{
typeset -n gds=$1
@ -75,9 +84,8 @@ log_must zpool prefetch -t ddt $TESTPOOL
# Build up the deduplicated dataset. This consists of creating enough files
# to generate a reasonable size DDT for testing purposes.
DATASET=$TESTPOOL/ddt
log_must zfs create -o compression=off -o dedup=on $DATASET
MNTPOINT=$(get_prop mountpoint $TESTPOOL/ddt)
MNTPOINT=$(get_prop mountpoint $DATASET)
log_note "Generating dataset ..."
typeset -i i=0

View File

@ -0,0 +1,95 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2025 by iXsystems, Inc.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# 'zpool prefetch -t brt <pool>' can successfully load a pool's BRT on demand.
# 'zpool prefetch <pool>' without -t prefetches both DDT and BRT.
#
# STRATEGY:
# 1. Create a dataset with block cloning enabled.
# 2. Create files and clone them to populate the BRT.
# 3. Export and import the pool to flush caches.
# 4. Use zpool prefetch -t brt to load BRT.
# 5. Test zpool prefetch without -t to prefetch all types.
#
verify_runnable "both"
if ! command -v clonefile > /dev/null ; then
log_unsupported "clonefile program required to test block cloning"
fi
log_assert "'zpool prefetch' can successfully load BRT and prefetch all types"
DATASET=$TESTPOOL/brt
function cleanup
{
datasetexists $DATASET && destroy_dataset $DATASET -f
}
log_onexit cleanup
log_must zfs create $DATASET
MNTPOINT=$(get_prop mountpoint $DATASET)
log_note "Generating cloned blocks for BRT ..."
# Create source file
log_must dd if=/dev/urandom of=$MNTPOINT/source bs=1M count=100
# Create clones using clonefile
typeset -i i=0
while (( i < 50 )); do
log_must clonefile -f $MNTPOINT/source $MNTPOINT/clone.$i
((i += 1))
done
sync_pool $TESTPOOL
# Verify BRT has entries (non-zero saved space)
brt_saved=$(zpool get -Hp -o value bclone_saved $TESTPOOL)
log_note "BRT saved space: $brt_saved"
log_must test "$brt_saved" -gt "0"
# Export/import to flush caches
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
# Test BRT prefetch - verify command succeeds
# Note: BRT does not expose cache statistics like DDT, so we can only
# verify the prefetch command completes successfully
log_must zpool prefetch -t brt $TESTPOOL
# Test prefetch without -t (should prefetch all types including BRT)
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
log_must zpool prefetch $TESTPOOL
log_pass "'zpool prefetch' successfully loads BRT and all types"