From 41878d57eaf3091c8405e80abb4b37ffe1746b39 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 10 Nov 2025 19:16:22 -0500 Subject: [PATCH] Add BRT support to zpool prefetch command Implement BRT (Block Reference Table) prefetch functionality similar to existing DDT prefetch. This allows preloading BRT metadata into ARC to improve performance for block cloning operations and frees of earlier cloned blocks. Make -t parameter optional. When omitted, prefetch all supported metadata types (both DDT and BRT now). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #17890 --- cmd/zpool/zpool_main.c | 34 ++++--- include/sys/brt.h | 1 + include/sys/fs/zfs.h | 3 +- lib/libzfs/libzfs_pool.c | 8 +- man/man8/zpool-prefetch.8 | 27 +++--- module/zfs/brt.c | 25 +++++ module/zfs/dmu.c | 5 +- module/zfs/zfs_ioctl.c | 27 ++++-- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_prefetch/zpool_prefetch_001_pos.ksh | 12 ++- .../zpool_prefetch/zpool_prefetch_002_pos.ksh | 95 +++++++++++++++++++ 12 files changed, 198 insertions(+), 42 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_002_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 1feec55c0..18952775b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -494,8 +494,7 @@ get_usage(zpool_help_t idx) "[--json-int, --json-pool-key-guid]] ...\n" "\t [-T d|u] [pool] [interval [count]]\n")); case HELP_PREFETCH: - return (gettext("\tprefetch -t [] \n" - "\t -t ddt \n")); + return (gettext("\tprefetch [-t ] \n")); case HELP_OFFLINE: return (gettext("\toffline [--power]|[[-f][-t]] " " ...\n")); @@ -4200,7 +4199,7 @@ zpool_do_checkpoint(int argc, char **argv) #define CHECKPOINT_OPT 1024 /* - * zpool prefetch [] + * zpool prefetch [-t ] * * Prefetchs a particular type of data in the specified pool. 
*/ @@ -4245,20 +4244,27 @@ zpool_do_prefetch(int argc, char **argv) poolname = argv[0]; - argc--; - argv++; - - if (strcmp(typestr, "ddt") == 0) { - type = ZPOOL_PREFETCH_DDT; - } else { - (void) fprintf(stderr, gettext("unsupported prefetch type\n")); - usage(B_FALSE); - } - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); - err = zpool_prefetch(zhp, type); + if (typestr == NULL) { + /* Prefetch all types */ + err = zpool_prefetch(zhp, ZPOOL_PREFETCH_DDT); + if (err == 0) + err = zpool_prefetch(zhp, ZPOOL_PREFETCH_BRT); + } else { + if (strcmp(typestr, "ddt") == 0) { + type = ZPOOL_PREFETCH_DDT; + } else if (strcmp(typestr, "brt") == 0) { + type = ZPOOL_PREFETCH_BRT; + } else { + (void) fprintf(stderr, + gettext("unsupported prefetch type\n")); + zpool_close(zhp); + usage(B_FALSE); + } + err = zpool_prefetch(zhp, type); + } zpool_close(zhp); diff --git a/include/sys/brt.h b/include/sys/brt.h index d7c1814b0..2a23a6a7f 100644 --- a/include/sys/brt.h +++ b/include/sys/brt.h @@ -56,6 +56,7 @@ extern void brt_create(spa_t *spa); extern int brt_load(spa_t *spa); extern void brt_unload(spa_t *spa); extern void brt_sync(spa_t *spa, uint64_t txg); +extern void brt_prefetch_all(spa_t *spa); #ifdef __cplusplus } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 662fd81c5..aa7421261 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1713,7 +1713,8 @@ typedef enum { typedef enum { ZPOOL_PREFETCH_NONE = 0, - ZPOOL_PREFETCH_DDT + ZPOOL_PREFETCH_DDT, + ZPOOL_PREFETCH_BRT } zpool_prefetch_type_t; typedef enum { diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ce154ae1a..756d701e2 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1745,9 +1745,13 @@ zpool_prefetch(zpool_handle_t *zhp, zpool_prefetch_type_t type) error = lzc_pool_prefetch(zhp->zpool_name, type); if (error != 0) { + const char *typename = "unknown"; + if (type == ZPOOL_PREFETCH_DDT) + typename = "ddt"; + else if (type == 
ZPOOL_PREFETCH_BRT) + typename = "brt"; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, - "cannot prefetch %s in '%s'"), - type == ZPOOL_PREFETCH_DDT ? "ddt" : "", zhp->zpool_name); + "cannot prefetch %s in '%s'"), typename, zhp->zpool_name); (void) zpool_standard_error(hdl, error, msg); return (-1); } diff --git a/man/man8/zpool-prefetch.8 b/man/man8/zpool-prefetch.8 index a36ad52e6..6f4c3b129 100644 --- a/man/man8/zpool-prefetch.8 +++ b/man/man8/zpool-prefetch.8 @@ -28,20 +28,25 @@ . .Sh NAME .Nm zpool-prefetch -.Nd Loads specific types of data for the given pool +.Nd Prefetches pool metadata into ARC .Sh SYNOPSIS .Nm zpool .Cm prefetch -.Fl t Ar type +.Op Fl t Ar type .Ar pool .Sh DESCRIPTION -.Bl -tag -width Ds -.It Xo -.Nm zpool -.Cm prefetch -.Fl t Li ddt -.Ar pool -.Xc -Prefetch data of a specific type for the given pool; specifically the DDT, -which will improve write I/O performance when the DDT is resident in the ARC. +Massively prefetch metadata of a specific type for the given pool into the ARC +to reduce latency of some operations later. +If no type is specified, all types are prefetched. +.Pp +The following types are supported: +.Bl -tag -width "brt" +.It Sy brt +Prefetch the BRT (block reference table). +This may improve performance for block cloning operations, +and frees for earlier cloned blocks. +.It Sy ddt +Prefetch the DDT (deduplication table). +This may improve performance of writes when deduplication is enabled, +and frees for earlier deduplicated blocks. .El diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 60f42116c..08a6bd52a 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1510,6 +1510,31 @@ brt_load(spa_t *spa) return (error); } +void +brt_prefetch_all(spa_t *spa) +{ + /* + * Prefetch the BRT entries ZAP object of each vdev. The BRT lock only + * protects the walk of spa_brt_vdevs[] and is dropped around each + * prefetch, which runs under bv_mos_entries_lock held as reader.
+ */ + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); + + rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); + if (brtvd->bv_mos_entries != 0) { + (void) zap_prefetch_object(spa->spa_meta_objset, + brtvd->bv_mos_entries); + } + rw_exit(&brtvd->bv_mos_entries_lock); + + brt_rlock(spa); + } + brt_unlock(spa); +} + void brt_unload(spa_t *spa) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 249f878a1..5690f8afa 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -850,12 +850,15 @@ dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size) return (err); /* - * Chunk the requests (16 indirects worth) so that we can be interrupted + * Chunk the requests (16 indirects worth) so that we can be + * interrupted. Prefetch at least SPA_MAXBLOCKSIZE at a time + * to better utilize pools with smaller block sizes. */ uint64_t chunksize; if (dn->dn_indblkshift) { uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1); chunksize = (nbps * 16) << dn->dn_datablkshift; + chunksize = MAX(chunksize, SPA_MAXBLOCKSIZE); } else { chunksize = dn->dn_datablksz; } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 0ec6e1e23..1b2392aea 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -212,6 +212,8 @@ #include #include #include +#include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -4276,13 +4278,11 @@ zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) spa_t *spa; int32_t type; - /* - * Currently, only ZPOOL_PREFETCH_DDT is supported - */ - if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0 || - type != ZPOOL_PREFETCH_DDT) { + if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0) + return (EINVAL); + + if (type != ZPOOL_PREFETCH_DDT && type != ZPOOL_PREFETCH_BRT) return (EINVAL); - } error = spa_open(poolname, &spa, FTAG); if (error != 0) @@ -4290,10 
+4290,17 @@ zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) hrtime_t start_time = gethrtime(); - ddt_prefetch_all(spa); - - zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", spa->spa_name, - (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + if (type == ZPOOL_PREFETCH_DDT) { + ddt_prefetch_all(spa); + zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", + spa->spa_name, + (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + } else { + brt_prefetch_all(spa); + zfs_dbgmsg("pool '%s': loaded brt into ARC in %llu ms", + spa->spa_name, + (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + } spa_close(spa, FTAG); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9f531411f..a69c6e3c8 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -215,7 +215,7 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zpool_prefetch] -tests = ['zpool_prefetch_001_pos'] +tests = ['zpool_prefetch_001_pos', 'zpool_prefetch_002_pos'] tags = ['functional', 'cli_root', 'zpool_prefetch'] [tests/functional/cli_root/zfs_destroy] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 678c01b58..23284234c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1217,6 +1217,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_prefetch/cleanup.ksh \ functional/cli_root/zpool_prefetch/setup.ksh \ functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \ + functional/cli_root/zpool_prefetch/zpool_prefetch_002_pos.ksh \ functional/cli_root/zpool_reguid/cleanup.ksh \ functional/cli_root/zpool_reguid/setup.ksh \ functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh index 8ef3a66ad..fd446e46e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh @@ -42,6 +42,15 @@ verify_runnable "both" log_assert "'zpool prefetch -t ddt ' can successfully load the DDT for a pool." +DATASET=$TESTPOOL/ddt + +function cleanup +{ + datasetexists $DATASET && destroy_dataset $DATASET -f +} + +log_onexit cleanup + function getddtstats { typeset -n gds=$1 @@ -75,9 +84,8 @@ log_must zpool prefetch -t ddt $TESTPOOL # Build up the deduplicated dataset. This consists of creating enough files # to generate a reasonable size DDT for testing purposes. -DATASET=$TESTPOOL/ddt log_must zfs create -o compression=off -o dedup=on $DATASET -MNTPOINT=$(get_prop mountpoint $TESTPOOL/ddt) +MNTPOINT=$(get_prop mountpoint $DATASET) log_note "Generating dataset ..." typeset -i i=0 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_002_pos.ksh new file mode 100755 index 000000000..f34f8c36e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_002_pos.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool prefetch -t brt <pool>' can successfully load a pool's BRT on demand. +# 'zpool prefetch <pool>' without -t prefetches both DDT and BRT. +# +# STRATEGY: +# 1. Create a dataset with block cloning enabled. +# 2. Create files and clone them to populate the BRT. +# 3. Export and import the pool to flush caches. +# 4. Use zpool prefetch -t brt to load BRT. +# 5. Test zpool prefetch without -t to prefetch all types. +# + +verify_runnable "both" + +if ! command -v clonefile > /dev/null ; then + log_unsupported "clonefile program required to test block cloning" +fi + +log_assert "'zpool prefetch' can successfully load BRT and prefetch all types" + +DATASET=$TESTPOOL/brt + +function cleanup +{ + datasetexists $DATASET && destroy_dataset $DATASET -f +} + +log_onexit cleanup +log_must zfs create $DATASET +MNTPOINT=$(get_prop mountpoint $DATASET) + +log_note "Generating cloned blocks for BRT ..."
+ +# Create source file +log_must dd if=/dev/urandom of=$MNTPOINT/source bs=1M count=100 + +# Create clones using clonefile +typeset -i i=0 +while (( i < 50 )); do + log_must clonefile -f $MNTPOINT/source $MNTPOINT/clone.$i + ((i += 1)) +done + +sync_pool $TESTPOOL + +# Verify BRT has entries (non-zero saved space) +brt_saved=$(zpool get -Hp -o value bclone_saved $TESTPOOL) +log_note "BRT saved space: $brt_saved" +log_must test "$brt_saved" -gt "0" + +# Export/import to flush caches +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +# Test BRT prefetch - verify command succeeds +# Note: BRT does not expose cache statistics like DDT, so we can only +# verify the prefetch command completes successfully +log_must zpool prefetch -t brt $TESTPOOL + +# Test prefetch without -t (should prefetch all types including BRT) +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must zpool prefetch $TESTPOOL + +log_pass "'zpool prefetch' successfully loads BRT and all types"