diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 23cc590cc..d401e0879 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -513,8 +513,8 @@ get_usage(zpool_help_t idx)
 		return (gettext("\tinitialize [-c | -s | -u] [-w] <-a | <pool> "
 		    "[<device> ...]>\n"));
 	case HELP_SCRUB:
-		return (gettext("\tscrub [-e | -s | -p | -C] [-w] <-a | "
-		    "<pool> [<pool> ...]>\n"));
+		return (gettext("\tscrub [-e | -s | -p | -C | -E | -S] [-w] "
+		    "<-a | <pool> [<pool> ...]>\n"));
 	case HELP_RESILVER:
 		return (gettext("\tresilver <pool> ...\n"));
 	case HELP_TRIM:
@@ -8359,6 +8359,8 @@ zpool_do_reopen(int argc, char **argv)
 typedef struct scrub_cbdata {
 	int cb_type;
 	pool_scrub_cmd_t cb_scrub_cmd;
+	time_t cb_date_start;
+	time_t cb_date_end;
 } scrub_cbdata_t;
 
 static boolean_t
@@ -8402,8 +8404,8 @@ scrub_callback(zpool_handle_t *zhp, void *data)
 		return (1);
 	}
 
-	err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
-
+	err = zpool_scan_range(zhp, cb->cb_type, cb->cb_scrub_cmd,
+	    cb->cb_date_start, cb->cb_date_end);
 	if (err == 0 && zpool_has_checkpoint(zhp) &&
 	    cb->cb_type == POOL_SCAN_SCRUB) {
 		(void) printf(gettext("warning: will not scrub state that "
@@ -8421,10 +8423,55 @@ wait_callback(zpool_handle_t *zhp, void *data)
 	return (zpool_wait(zhp, *act));
 }
 
+/*
+ * Convert a date given in local time, either "YYYY-MM-DD HH:MM" or
+ * "YYYY-MM-DD", into seconds since the epoch.
+ *
+ * If 'rounding' is set, the result is moved forward by one unit of the
+ * precision that was supplied (a minute or a day), which makes an end date
+ * inclusive.  Input that cannot be parsed, or that has trailing characters,
+ * is reported on stderr and handed to usage().
+ */
+static time_t
+date_string_to_sec(const char *timestr, boolean_t rounding)
+{
+	/* tm_isdst = -1 lets mktime() work out whether DST is in effect. */
+	static const struct tm tm_unset = { .tm_isdst = -1 };
+	struct tm tm = tm_unset;
+	time_t adjustment = 60;
+	time_t secs;
+	const char *end;
+
+	end = strptime(timestr, "%Y-%m-%d %H:%M", &tm);
+	if (end == NULL || *end != '\0') {
+		/*
+		 * Retry as a bare date, starting from a clean struct tm
+		 * because the failed attempt may have set some fields.
+		 */
+		tm = tm_unset;
+		adjustment = 24 * 60 * 60;
+		end = strptime(timestr, "%Y-%m-%d", &tm);
+	}
+
+	if (end != NULL && *end == '\0')
+		secs = mktime(&tm);
+	else
+		secs = (time_t)-1;
+
+	if (secs == (time_t)-1) {
+		(void) fprintf(stderr, gettext("Failed to parse the date.\n"));
+		usage(B_FALSE);
+	}
+
+	return (rounding ? secs + adjustment : secs);
+}
+
 /*
- * zpool scrub [-e | -s | -p | -C] [-w] <pool> ...
+ * zpool scrub [-e | -s | -p | -C | -E | -S] [-w] <pool> ...
  *
  *	-e	Only scrub blocks in the error log.
+ *	-E	End date of scrub.
+ *	-S	Start date of scrub.
  *	-s	Stop.  Stops any in-progress scrub.
  *	-p	Pause. Pause in-progress scrub.
  *	-w	Wait.  Blocks until scrub has completed.
@@ -8440,6 +8487,7 @@ zpool_do_scrub(int argc, char **argv)
 
 	cb.cb_type = POOL_SCAN_SCRUB;
 	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+	cb.cb_date_start = cb.cb_date_end = 0;
 
 	boolean_t is_error_scrub = B_FALSE;
 	boolean_t is_pause = B_FALSE;
@@ -8448,7 +8496,7 @@ zpool_do_scrub(int argc, char **argv)
 	boolean_t scrub_all = B_FALSE;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "aspweC")) != -1) {
+	while ((c = getopt(argc, argv, "aspweCE:S:")) != -1) {
 		switch (c) {
 		case 'a':
 			scrub_all = B_TRUE;
@@ -8456,9 +8504,19 @@ zpool_do_scrub(int argc, char **argv)
 		case 'e':
 			is_error_scrub = B_TRUE;
 			break;
+		case 'E':
+			/*
+			 * Round the date. It's better to scrub more data than
+			 * less. This also makes the date inclusive.
+			 */
+			cb.cb_date_end = date_string_to_sec(optarg, B_TRUE);
+			break;
 		case 's':
 			is_stop = B_TRUE;
 			break;
+		case 'S':
+			cb.cb_date_start = date_string_to_sec(optarg, B_FALSE);
+			break;
 		case 'p':
 			is_pause = B_TRUE;
 			break;
@@ -8506,6 +8564,19 @@ zpool_do_scrub(int argc, char **argv)
 		}
 	}
 
+	if ((cb.cb_date_start != 0 || cb.cb_date_end != 0) &&
+	    cb.cb_scrub_cmd != POOL_SCRUB_NORMAL) {
+		(void) fprintf(stderr, gettext("invalid option combination: "
+		    "start/end date is available only with normal scrub\n"));
+		usage(B_FALSE);
+	}
+	if (cb.cb_date_start != 0 && cb.cb_date_end != 0 &&
+	    cb.cb_date_start > cb.cb_date_end) {
+		(void) fprintf(stderr, gettext("invalid arguments: "
+		    "end date has to be later than start date\n"));
+		usage(B_FALSE);
+	}
+
 	if (wait && (cb.cb_type == POOL_SCAN_NONE ||
 	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) {
 		(void) fprintf(stderr, gettext("invalid option combination: "
@@ -8546,6 +8617,7 @@ zpool_do_resilver(int argc, char **argv)
 
 	cb.cb_type = POOL_SCAN_RESILVER;
 	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+	cb.cb_date_start = cb.cb_date_end = 0;
 
 	/* check options */
 	while ((c = getopt(argc, argv, "")) != -1) {
diff --git a/include/Makefile.am b/include/Makefile.am
index 
a0427ae6a..7588cd0ae 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -10,6 +10,7 @@ COMMON_H = \ cityhash.h \ zfeature_common.h \ zfs_comutil.h \ + zfs_crrd.h \ zfs_deleg.h \ zfs_fletcher.h \ zfs_namecheck.h \ diff --git a/include/libzfs.h b/include/libzfs.h index 187d7b449..3fcdc176a 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -302,6 +302,8 @@ typedef struct initialize_cbdata { * Functions to manipulate pool and vdev state */ _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); +_LIBZFS_H int zpool_scan_range(zpool_handle_t *, pool_scan_func_t, + pool_scrub_cmd_t, time_t, time_t); _LIBZFS_H int zpool_initialize_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0b2e443a4..7dc6daaf0 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -414,6 +414,9 @@ typedef struct dmu_buf { #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" +#define DMU_POOL_TXG_LOG_TIME_MINUTES "com.klaraystems:txg_log_time:minutes" +#define DMU_POOL_TXG_LOG_TIME_DAYS "com.klaraystems:txg_log_time:days" +#define DMU_POOL_TXG_LOG_TIME_MONTHS "com.klaraystems:txg_log_time:months" /* * Allocate an object from this objset. 
The range of object numbers diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index a596235ce..07a959db3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -55,6 +55,8 @@ #include #include +#include "zfs_crrd.h" + #ifdef __cplusplus extern "C" { #endif @@ -344,6 +346,12 @@ struct spa { spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; + kmutex_t spa_txg_log_time_lock; /* for spa_txg_log_time */ + dbrrd_t spa_txg_log_time; + uint64_t spa_last_noted_txg; + uint64_t spa_last_noted_txg_time; + uint64_t spa_last_flush_txg_time; + space_map_t *spa_syncing_log_sm; /* current log space map */ avl_tree_t spa_sm_logs_by_txg; kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ diff --git a/include/zfs_crrd.h b/include/zfs_crrd.h new file mode 100644 index 000000000..ba192a206 --- /dev/null +++ b/include/zfs_crrd.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024 Klara Inc. + * + * This software was developed by + * Mariusz Zaborski + * Fred Weigel + * under sponsorship from Wasabi Technology, Inc. and Klara Inc. 
+ */ + +#ifndef _CRRD_H_ +#define _CRRD_H_ + +#define RRD_MAX_ENTRIES 256 + +#define RRD_ENTRY_SIZE sizeof (uint64_t) +#define RRD_STRUCT_ELEM (sizeof (rrd_t) / RRD_ENTRY_SIZE) + +typedef enum { + DBRRD_FLOOR, + DBRRD_CEILING +} dbrrd_rounding_t; + +typedef struct { + uint64_t rrdd_time; + uint64_t rrdd_txg; +} rrd_data_t; + +typedef struct { + uint64_t rrd_head; /* head (beginning) */ + uint64_t rrd_tail; /* tail (end) */ + uint64_t rrd_length; + + rrd_data_t rrd_entries[RRD_MAX_ENTRIES]; +} rrd_t; + +typedef struct { + rrd_t dbr_minutes; + rrd_t dbr_days; + rrd_t dbr_months; +} dbrrd_t; + +size_t rrd_len(rrd_t *rrd); + +const rrd_data_t *rrd_entry(rrd_t *r, size_t i); +rrd_data_t *rrd_tail_entry(rrd_t *rrd); +uint64_t rrd_tail(rrd_t *rrd); +uint64_t rrd_get(rrd_t *rrd, size_t i); + +void rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg); + +void dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg); +uint64_t dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rouding); + +#endif diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index bd2ab6468..37d22402e 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -574,6 +574,7 @@ + @@ -6946,6 +6947,14 @@ + + + + + + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index b6fb153c4..10b42720e 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2773,7 +2773,13 @@ out: * Scan the pool. 
*/ int -zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) +zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) { + return (zpool_scan_range(zhp, func, cmd, 0, 0)); +} + +int +zpool_scan_range(zpool_handle_t *zhp, pool_scan_func_t func, + pool_scrub_cmd_t cmd, time_t date_start, time_t date_end) { char errbuf[ERRBUFLEN]; int err; @@ -2782,6 +2788,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, "scan_type", (uint64_t)func); fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd); + fnvlist_add_uint64(args, "scan_date_start", (uint64_t)date_start); + fnvlist_add_uint64(args, "scan_date_end", (uint64_t)date_end); err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL); fnvlist_free(args); diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 5cdb6a3eb..aeacc595b 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -177,6 +177,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/zfs_byteswap.c \ module/zfs/zfs_chksum.c \ module/zfs/zfs_debug_common.c \ + module/zfs/zfs_crrd.c \ module/zfs/zfs_fm.c \ module/zfs/zfs_fuid.c \ module/zfs/zfs_ratelimit.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e00b1848b..fa37c7cdb 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2246,6 +2246,21 @@ Defer frees starting in this pass. Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. . +.It Sy spa_note_txg_time Ns = Ns Sy 600 Pq uint +This parameter defines, in seconds, how often the TXG time database will record +a new TXG if it has changed. +After the specified time interval has passed, and if the TXG number has changed, +the new value is recorded in the database. +These timestamps can later be used for more granular operations, such as +scrubbing. +. 
+.It Sy spa_flush_txg_time Ns = Ns Sy 600 Pq uint +This parameter defines, in seconds, how often ZFS will flush +the TXG time database to disk. +It ensures that the data is actually written to persistent storage, which helps +preserve the database in case of unexpected shutdown. +The database is also automatically flushed during the export sequence. +. .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index 9b4cf132c..0ecf8bd38 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -28,7 +28,7 @@ .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP. .\" -.Dd November 18, 2024 +.Dd December 11, 2024 .Dt ZPOOL-SCRUB 8 .Os . @@ -40,6 +40,8 @@ .Cm scrub .Op Ns Fl e | Ns Fl p | Fl s Ns | Fl C Ns .Op Fl w +.Op Fl S Ar date +.Op Fl E Ar date .Fl a Ns | Ns Ar pool Ns … . .Sh DESCRIPTION @@ -125,6 +127,44 @@ resilvering, nor can it be run when a regular scrub is paused. Continue scrub from last saved txg (see zpool .Sy last_scrubbed_txg property). +.It Fl S Ar date , Fl E Ar date +Restrict the scrub to blocks created within the given date range. +.Bl -bullet -compact -offset indent +.It +.Fl S +Defines a start date. +If not specified, scrubbing begins from the start of the pool's +existence. +.It +.Fl E +Defines an end date. +If not specified, scrubbing continues up to the most recent data. +.El +The provided date should be in the format: +.Dq YYYY-MM-DD HH:MM . +Where: +.Bl -bullet -compact -offset indent +.It +.Dq YYYY +is the year. +.It +.Dq MM +is the numeric representation of the month. +.It +.Dq DD +is the day of the month. +.It +.Dq HH +is the hour. +.It +.Dq MM +is the minute. +.El +The hour and minutes parameters can be omitted. 
+The time should be provided in machine local time zone. +Specifying dates prior to enabling this feature will result in scrubbing +starting from the date the pool was created. +If the time was moved backward manually the data range may become inaccurate. .El .Sh EXAMPLES .Ss Example 1 diff --git a/module/Kbuild.in b/module/Kbuild.in index ece603fee..3d6f288fa 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -406,6 +406,7 @@ ZFS_OBJS := \ zfs_byteswap.o \ zfs_chksum.o \ zfs_debug_common.o \ + zfs_crrd.o \ zfs_fm.o \ zfs_fuid.o \ zfs_impl.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 7e7c3db73..3ba38c43f 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -217,6 +217,7 @@ SRCS+= abd_os.c \ vdev_label_os.c \ zfs_acl.c \ zfs_ctldir.c \ + zfs_crrd.c \ zfs_debug.c \ zfs_dir.c \ zfs_file_os.c \ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 31f152a80..bbf474ed6 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -100,6 +100,7 @@ #include #endif /* _KERNEL */ +#include "zfs_crrd.h" #include "zfs_prop.h" #include "zfs_comutil.h" #include @@ -310,6 +311,41 @@ static int zfs_livelist_condense_zthr_cancel = 0; */ static int zfs_livelist_condense_new_alloc = 0; +/* + * Time variable to decide how often the txg should be added into the + * database (in seconds). + * The smallest available resolution is in minutes, which means an update occurs + * each time we reach `spa_note_txg_time` and the txg has changed. We provide + * a 256-slot ring buffer for minute-level resolution. The number is limited by + * the size of the structure we use and the maximum amount of bytes we can write + * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately + * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of + * high-resolution data. + * + * The user can decrease `spa_note_txg_time` to increase resolution within + * a day, at the cost of retaining fewer days of data. 
Alternatively, increasing + * the interval allows storing data over a longer period, but with lower + * frequency. + * + * This parameter does not affect the daily or monthly databases, as those only + * store one record per day and per month, respectively. + */ +static uint_t spa_note_txg_time = 10 * 60; + +/* + * How often flush txg database to a disk (in seconds). + * We flush data every time we write to it, making it the most reliable option. + * Since this happens every 10 minutes, it shouldn't introduce any noticeable + * overhead for the system. In case of failure, we will always have an + * up-to-date version of the database. + * + * The user can adjust the flush interval to a lower value, but it probably + * doesn't make sense to flush more often than the database is updated. + * The user can also increase the interval if they're concerned about the + * performance of writing the entire database to disk. + */ +static uint_t spa_flush_txg_time = 10 * 60; + /* * ========================================================================== * SPA properties routines @@ -2040,6 +2076,111 @@ spa_destroy_aux_threads(spa_t *spa) } } +static void +spa_sync_time_logger(spa_t *spa, uint64_t txg) +{ + uint64_t curtime; + dmu_tx_t *tx; + + if (!spa_writeable(spa)) { + return; + } + curtime = gethrestime_sec(); + if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) { + return; + } + + if (txg > spa->spa_last_noted_txg) { + spa->spa_last_noted_txg_time = curtime; + spa->spa_last_noted_txg = txg; + + mutex_enter(&spa->spa_txg_log_time_lock); + dbrrd_add(&spa->spa_txg_log_time, curtime, txg); + mutex_exit(&spa->spa_txg_log_time_lock); + } + + if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) { + return; + } + spa->spa_last_flush_txg_time = curtime; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + 
&spa->spa_txg_log_time.dbr_minutes, tx)); + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_days, tx)); + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_months, tx)); + dmu_tx_commit(tx); +} + +static void +spa_unload_sync_time_logger(spa_t *spa) +{ + uint64_t txg; + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + + txg = dmu_tx_get_txg(tx); + spa->spa_last_noted_txg_time = 0; + spa->spa_last_flush_txg_time = 0; + spa_sync_time_logger(spa, txg); + + dmu_tx_commit(tx); +} + +static void +spa_load_txg_log_time(spa_t *spa) +{ + int error; + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_minutes); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "minute resolution [error=%d]", error); + } + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_days); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "day resolution [error=%d]", error); + } + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_months); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "month resolution [error=%d]", error); + } +} + +static boolean_t +spa_should_sync_time_logger_on_unload(spa_t *spa) +{ + + if (!spa_writeable(spa)) + return (B_FALSE); + + if (!spa->spa_sync_on) + return (B_FALSE); + + if (spa_state(spa) != 
POOL_STATE_EXPORTED) + return (B_FALSE); + + if (spa->spa_last_noted_txg == 0) + return (B_FALSE); + + return (B_TRUE); +} + + /* * Opposite of spa_load(). */ @@ -2061,6 +2202,9 @@ spa_unload(spa_t *spa) * we delay the final TXGs beyond what spa_final_txg is set at. */ if (spa->spa_final_txg == UINT64_MAX) { + if (spa_should_sync_time_logger_on_unload(spa)) + spa_unload_sync_time_logger(spa); + /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some @@ -4717,6 +4861,9 @@ spa_ld_get_props(spa_t *spa) if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* Load time log */ + spa_load_txg_log_time(spa); + /* * Load the persistent error log. If we have an older pool, this will * not be present. @@ -7140,6 +7287,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa_config_exit(spa, SCL_ALL, FTAG); } + if (spa_should_sync_time_logger_on_unload(spa)) + spa_unload_sync_time_logger(spa); + /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some @@ -10190,6 +10340,8 @@ spa_sync(spa_t *spa, uint64_t txg) */ brt_pending_apply(spa, txg); + spa_sync_time_logger(spa, txg); + /* * Lock out configuration changes. 
*/ @@ -10232,6 +10384,7 @@ spa_sync(spa_t *spa, uint64_t txg) dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + @@ -11105,6 +11258,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); +ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW, + "How frequently TXG timestamps are stored internally (in seconds)"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW, + "How frequently the TXG timestamps database should be flushed " + "to disk (in seconds)"); + #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index d2ba1f954..2eba8362a 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -715,6 +715,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_txg_log_time_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -903,6 +904,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); + mutex_destroy(&spa->spa_txg_log_time_lock); kmem_free(spa, sizeof (spa_t)); } diff --git a/module/zfs/zfs_crrd.c b/module/zfs/zfs_crrd.c new file mode 100644 index 000000000..f9267ed41 --- /dev/null +++ b/module/zfs/zfs_crrd.c @@ -0,0 +1,227 @@ +// 
SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024 Klara Inc. + * + * This software was developed by + * Mariusz Zaborski + * Fred Weigel + * under sponsorship from Wasabi Technology, Inc. and Klara Inc. + */ +/* + * This file implements a round-robin database that stores timestamps and txg + * numbers. Due to limited space, we use a round-robin approach, where + * the oldest records are overwritten when there is no longer enough room. + * This is a best-effort mechanism, and the database should be treated as + * an approximation. Consider this before consuming it. + * + * The database is linear, meaning we assume each new entry is newer than the + * ones already stored. Because of this, if time is manipulated, the database + * will only accept records that are newer than the existing ones. + * (For example, jumping 10 years into the future and then back can lead to + * situation when for 10 years we wont write anything to database) + * + * All times stored in the database use UTC, which makes it easy to convert to + * and from local time. + * + * Each database holds 256 records (as defined in the `RRD_MAX_ENTRIES` macro). 
+ * This limit comes from the maximum size of a ZAP object, where we store the + * binary blob. + * + * We've split the database into three smaller ones. + * The `minute database` provides high resolution (default: every 10 minutes), + * but only covers approximately 1.5 days. This gives a detailed view of recent + * activity, useful, for example, when performing a scrub of the last hour. + * The `daily database` records one txg per day. With 256 entries, it retains + * roughly 8 months of data. This allows users to scrub or analyze txgs across + * a range of days. + * The `monthly database` stores one record per month, giving approximately + * 21 years of history. + * All these calculations assume the worst-case scenario: the pool is always + * online and actively written to. + * + * A potential source of confusion is that the database does not store data + * while the pool is offline, leading to potential gaps in timeline. Also, + * the database contains no records from before this feature was enabled. + * Both, upon reflection, are expected. + */ +#include + +#include "zfs_crrd.h" + +rrd_data_t * +rrd_tail_entry(rrd_t *rrd) +{ + size_t n; + + if (rrd_len(rrd) == 0) + return (NULL); + + if (rrd->rrd_tail == 0) + n = RRD_MAX_ENTRIES - 1; + else + n = rrd->rrd_tail - 1; + + return (&rrd->rrd_entries[n]); +} + +uint64_t +rrd_tail(rrd_t *rrd) +{ + const rrd_data_t *tail; + + tail = rrd_tail_entry(rrd); + + return (tail == NULL ? 0 : tail->rrdd_time); +} + +/* + * Return length of data in the rrd. + * rrd_get works from 0..rrd_len()-1. + */ +size_t +rrd_len(rrd_t *rrd) +{ + + return (rrd->rrd_length); +} + +const rrd_data_t * +rrd_entry(rrd_t *rrd, size_t i) +{ + size_t n; + + if (i >= rrd_len(rrd)) { + return (0); + } + + n = (rrd->rrd_head + i) % RRD_MAX_ENTRIES; + return (&rrd->rrd_entries[n]); +} + +uint64_t +rrd_get(rrd_t *rrd, size_t i) +{ + const rrd_data_t *data = rrd_entry(rrd, i); + + return (data == NULL ? 
0 : data->rrdd_txg); +} + +/* Add value to database. */ +void +rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg) +{ + rrd_data_t *tail; + + tail = rrd_tail_entry(rrd); + if (tail != NULL && tail->rrdd_time == time) { + if (tail->rrdd_txg < txg) { + tail->rrdd_txg = txg; + } else { + return; + } + } + + rrd->rrd_entries[rrd->rrd_tail].rrdd_time = time; + rrd->rrd_entries[rrd->rrd_tail].rrdd_txg = txg; + + rrd->rrd_tail = (rrd->rrd_tail + 1) % RRD_MAX_ENTRIES; + + if (rrd->rrd_length < RRD_MAX_ENTRIES) { + rrd->rrd_length++; + } else { + rrd->rrd_head = (rrd->rrd_head + 1) % RRD_MAX_ENTRIES; + } +} + +void +dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) +{ + hrtime_t daydiff, monthdiff, minutedif; + + minutedif = time - rrd_tail(&db->dbr_minutes); + daydiff = time - rrd_tail(&db->dbr_days); + monthdiff = time - rrd_tail(&db->dbr_months); + + if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60)) + rrd_add(&db->dbr_months, time, txg); + else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60)) + rrd_add(&db->dbr_days, time, txg); + else if (minutedif >= 0) + rrd_add(&db->dbr_minutes, time, txg); +} + +/* + * We could do a binary search here, but the routine isn't frequently + * called and the data is small so we stick to a simple loop. + */ +static const rrd_data_t * +rrd_query(rrd_t *rrd, hrtime_t tv, dbrrd_rounding_t rounding) +{ + const rrd_data_t *data = NULL; + + for (size_t i = 0; i < rrd_len(rrd); i++) { + const rrd_data_t *cur = rrd_entry(rrd, i); + + if (rounding == DBRRD_FLOOR) { + if (tv < cur->rrdd_time) { + break; + } + data = cur; + } else { + /* DBRRD_CEILING */ + if (tv <= cur->rrdd_time) { + data = cur; + break; + } + } + } + + return (data); +} + +static const rrd_data_t * +dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) +{ + + if (r1 == NULL) + return (r2); + if (r2 == NULL) + return (r1); + + return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? 
r1 : r2); +} + +uint64_t +dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rounding) +{ + const rrd_data_t *data, *dm, *dd, *dy; + + data = NULL; + dm = rrd_query(&r->dbr_minutes, tv, rounding); + dd = rrd_query(&r->dbr_days, tv, rounding); + dy = rrd_query(&r->dbr_months, tv, rounding); + + data = dbrrd_closest(tv, dbrrd_closest(tv, dd, dm), dy); + + return (data == NULL ? 0 : data->rrdd_txg); +} diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 3a413f4a7..dcb71229f 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1704,6 +1704,8 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { {"scan_type", DATA_TYPE_UINT64, 0}, {"scan_command", DATA_TYPE_UINT64, 0}, + {"scan_date_start", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"scan_date_end", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int @@ -1712,6 +1714,7 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) spa_t *spa; int error; uint64_t scan_type, scan_cmd; + uint64_t date_start, date_end; if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) return (SET_ERROR(EINVAL)); @@ -1721,6 +1724,11 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if (scan_cmd >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, "scan_date_start", &date_start) != 0) + date_start = 0; + if (nvlist_lookup_uint64(innvl, "scan_date_end", &date_end) != 0) + date_end = 0; + if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); @@ -1732,7 +1740,24 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) error = spa_scan_range(spa, scan_type, spa_get_last_scrubbed_txg(spa), 0); } else { - error = spa_scan(spa, scan_type); + uint64_t txg_start, txg_end; + + txg_start = txg_end = 0; + if (date_start != 0 || date_end != 0) { + mutex_enter(&spa->spa_txg_log_time_lock); + if (date_start != 0) { + txg_start = dbrrd_query(&spa->spa_txg_log_time, + date_start, 
DBRRD_FLOOR); + } + + if (date_end != 0) { + txg_end = dbrrd_query(&spa->spa_txg_log_time, + date_end, DBRRD_CEILING); + } + mutex_exit(&spa->spa_txg_log_time_lock); + } + + error = spa_scan_range(spa, scan_type, txg_start, txg_end); } spa_close(spa, FTAG); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 16869d397..deca3c05b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -545,7 +545,8 @@ tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies', 'zpool_scrub_multiple_pools', 'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos', - 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos'] + 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos', + 'zpool_scrub_date_range_001'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 98e14ad97..e273c9f85 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -87,6 +87,7 @@ SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata +SPA_NOTE_TXG_TIME spa.note_txg_time spa_note_txg_time TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 505fe3daf..5ab28b2d6 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1244,6 +1244,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ 
functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \ + functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh new file mode 100755 index 000000000..7f5f8052c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2025 Klara, Inc. +# Copyright 2025 Mariusz Zaborski +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify that the date range scrub only scrubs the files that were +# created/modified within a given time slot. +# +# STRATEGY: +# 1. Write a file. +# 2. 
Force a sync of everything via export/import. +# 3. Wait for one minute. +# 4. Repeat steps 1, 2, and 3 two more times. +# 5. Inject checksum errors into all 3 files. +# 6. Scrub the date range for the first file. +# 7. Verify that the first file is scrubbed. +# 8. Verify that the other files are not scrubbed. +# 9. Repeat steps 6–8 for each of the remaining 2 files. +# + +verify_runnable "global" + +function cleanup +{ + log_must zinject -c all + rm -f $TESTDIR/*_file + log_must restore_tunable SPA_NOTE_TXG_TIME +} + +log_onexit cleanup + +log_assert "Verifiy scrub, -E, and -S show expected status." + +log_must save_tunable SPA_NOTE_TXG_TIME +log_must set_tunable64 SPA_NOTE_TXG_TIME 30 + +typeset -a date_list +for i in `seq 0 2`; do + log_must sleep 60 + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + date_list+=("$(date '+%Y-%m-%d %H:%M')") + + log_must file_write -o create -f"$TESTDIR/${i}_file" \ + -b 512 -c 2048 -dR + + log_must sleep 60 + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + date_list+=("$(date '+%Y-%m-%d %H:%M')") +done + +for i in `seq 0 2`; do + log_must zinject -t data -e checksum -f 100 $TESTDIR/${i}_file +done + +for i in `seq 0 2`; do + log_must zpool scrub -w -S "${date_list[$((i * 2))]}" -E "${date_list[$((i * 2 + 1))]}" $TESTPOOL + log_must eval "zpool status -v $TESTPOOL | grep '${i}_file'" + for j in `seq 0 2`; do + if [ $i == $j ]; then + continue + fi + log_mustnot eval "zpool status -v $TESTPOOL | grep '${j}_file'" + done +done + +log_pass "Verified scrub, -E, and -S show expected status."