Add TXG timestamp database

This feature enables tracking of when TXGs are committed to disk,
providing an estimated timestamp for each TXG.

With this information, it becomes possible to perform scrubs based
on specific date ranges, improving the granularity of data
management and recovery operations.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Paul Dagnelie <paul.dagnelie@klarasystems.com>
Signed-off-by: Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #16853
This commit is contained in:
Mariusz Zaborski 2025-08-06 19:31:21 +02:00 committed by GitHub
parent c3496b5cc6
commit 894edd084e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 736 additions and 10 deletions

View File

@ -513,8 +513,8 @@ get_usage(zpool_help_t idx)
return (gettext("\tinitialize [-c | -s | -u] [-w] <-a | <pool> "
"[<device> ...]>\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-e | -s | -p | -C] [-w] <-a | "
"<pool> [<pool> ...]>\n"));
return (gettext("\tscrub [-e | -s | -p | -C | -E | -S] [-w] "
"<-a | <pool> [<pool> ...]>\n"));
case HELP_RESILVER:
return (gettext("\tresilver <pool> ...\n"));
case HELP_TRIM:
@ -8359,6 +8359,8 @@ zpool_do_reopen(int argc, char **argv)
/*
 * Arguments handed from the scrub/resilver option parsers to scrub_callback(),
 * which forwards all four fields to zpool_scan_range().
 */
typedef struct scrub_cbdata {
/* Scan function to run (e.g. POOL_SCAN_SCRUB, POOL_SCAN_RESILVER). */
int cb_type;
/* Scrub command (e.g. POOL_SCRUB_NORMAL, POOL_SCRUB_PAUSE). */
pool_scrub_cmd_t cb_scrub_cmd;
/* Start of the date range (from -S); 0 when no start date was given. */
time_t cb_date_start;
/* End of the date range (from -E); 0 when no end date was given. */
time_t cb_date_end;
} scrub_cbdata_t;
static boolean_t
@ -8402,8 +8404,8 @@ scrub_callback(zpool_handle_t *zhp, void *data)
return (1);
}
err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
err = zpool_scan_range(zhp, cb->cb_type, cb->cb_scrub_cmd,
cb->cb_date_start, cb->cb_date_end);
if (err == 0 && zpool_has_checkpoint(zhp) &&
cb->cb_type == POOL_SCAN_SCRUB) {
(void) printf(gettext("warning: will not scrub state that "
@ -8421,10 +8423,34 @@ wait_callback(zpool_handle_t *zhp, void *data)
return (zpool_wait(zhp, *act));
}
/*
 * Convert a user-supplied date into a time_t, interpreting the date in the
 * local time zone.
 *
 * Two formats are accepted: "YYYY-MM-DD HH:MM" and "YYYY-MM-DD".  The whole
 * string has to match one of them; trailing characters are rejected instead
 * of being silently ignored.
 *
 * When rounding is B_TRUE the result is moved forward by one unit of the
 * precision that was supplied: one minute for the long format, one day for
 * the date-only format.
 *
 * On a parse or conversion failure an error is printed and usage() is called.
 */
static time_t
date_string_to_sec(const char *timestr, boolean_t rounding)
{
	struct tm tm = {0};
	const char *end;
	time_t sec;
	int adjustment;

	/* Allow mktime to determine timezone. */
	tm.tm_isdst = -1;

	end = strptime(timestr, "%Y-%m-%d %H:%M", &tm);
	if (end != NULL && *end == '\0') {
		adjustment = 60;
	} else {
		/*
		 * The first attempt may have stored some fields (for example
		 * tm_hour) before it failed.  Start again from a clean
		 * structure so they cannot leak into the date-only result.
		 */
		tm = (struct tm){0};
		tm.tm_isdst = -1;

		end = strptime(timestr, "%Y-%m-%d", &tm);
		if (end == NULL || *end != '\0') {
			(void) fprintf(stderr,
			    gettext("Failed to parse the date.\n"));
			usage(B_FALSE);
		}
		adjustment = 24 * 60 * 60;
	}

	sec = mktime(&tm);
	if (sec == (time_t)-1) {
		/* The broken-down time cannot be represented as a time_t. */
		(void) fprintf(stderr, gettext("Failed to parse the date.\n"));
		usage(B_FALSE);
	}

	return (sec + (rounding ? adjustment : 0));
}
/*
* zpool scrub [-e | -s | -p | -C] [-w] <pool> ...
* zpool scrub [-e | -s | -p | -C | -E | -S] [-w] <pool> ...
*
* -e Only scrub blocks in the error log.
* -E End date of scrub.
* -S Start date of scrub.
* -s Stop. Stops any in-progress scrub.
* -p Pause. Pause in-progress scrub.
* -w Wait. Blocks until scrub has completed.
@ -8440,6 +8466,7 @@ zpool_do_scrub(int argc, char **argv)
cb.cb_type = POOL_SCAN_SCRUB;
cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
cb.cb_date_start = cb.cb_date_end = 0;
boolean_t is_error_scrub = B_FALSE;
boolean_t is_pause = B_FALSE;
@ -8448,7 +8475,7 @@ zpool_do_scrub(int argc, char **argv)
boolean_t scrub_all = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, "aspweC")) != -1) {
while ((c = getopt(argc, argv, "aspweCE:S:")) != -1) {
switch (c) {
case 'a':
scrub_all = B_TRUE;
@ -8456,9 +8483,19 @@ zpool_do_scrub(int argc, char **argv)
case 'e':
is_error_scrub = B_TRUE;
break;
case 'E':
/*
* Round the date. It's better to scrub more data than
* less. This also makes the date inclusive.
*/
cb.cb_date_end = date_string_to_sec(optarg, B_TRUE);
break;
case 's':
is_stop = B_TRUE;
break;
case 'S':
cb.cb_date_start = date_string_to_sec(optarg, B_FALSE);
break;
case 'p':
is_pause = B_TRUE;
break;
@ -8506,6 +8543,19 @@ zpool_do_scrub(int argc, char **argv)
}
}
if ((cb.cb_date_start != 0 || cb.cb_date_end != 0) &&
cb.cb_scrub_cmd != POOL_SCRUB_NORMAL) {
(void) fprintf(stderr, gettext("invalid option combination: "
"start/end date is available only with normal scrub\n"));
usage(B_FALSE);
}
if (cb.cb_date_start != 0 && cb.cb_date_end != 0 &&
cb.cb_date_start > cb.cb_date_end) {
(void) fprintf(stderr, gettext("invalid arguments: "
"end date has to be later than start date\n"));
usage(B_FALSE);
}
if (wait && (cb.cb_type == POOL_SCAN_NONE ||
cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) {
(void) fprintf(stderr, gettext("invalid option combination: "
@ -8546,6 +8596,7 @@ zpool_do_resilver(int argc, char **argv)
cb.cb_type = POOL_SCAN_RESILVER;
cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
cb.cb_date_start = cb.cb_date_end = 0;
/* check options */
while ((c = getopt(argc, argv, "")) != -1) {

View File

@ -10,6 +10,7 @@ COMMON_H = \
cityhash.h \
zfeature_common.h \
zfs_comutil.h \
zfs_crrd.h \
zfs_deleg.h \
zfs_fletcher.h \
zfs_namecheck.h \

View File

@ -302,6 +302,8 @@ typedef struct initialize_cbdata {
* Functions to manipulate pool and vdev state
*/
_LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
_LIBZFS_H int zpool_scan_range(zpool_handle_t *, pool_scan_func_t,
pool_scrub_cmd_t, time_t, time_t);
_LIBZFS_H int zpool_initialize_one(zpool_handle_t *, void *);
_LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
nvlist_t *);

View File

@ -414,6 +414,9 @@ typedef struct dmu_buf {
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap"
#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"
/*
 * Names under which the three TXG timestamp databases (minute, day and month
 * resolution) are stored in the pool directory object.
 *
 * NOTE(review): "klaraystems" looks like a misspelling of "klarasystems".
 * These strings are used as on-disk keys, so correcting the spelling would
 * presumably orphan databases already written under the current names --
 * confirm before changing.
 */
#define DMU_POOL_TXG_LOG_TIME_MINUTES "com.klaraystems:txg_log_time:minutes"
#define DMU_POOL_TXG_LOG_TIME_DAYS "com.klaraystems:txg_log_time:days"
#define DMU_POOL_TXG_LOG_TIME_MONTHS "com.klaraystems:txg_log_time:months"
/*
* Allocate an object from this objset. The range of object numbers

View File

@ -55,6 +55,8 @@
#include <sys/dsl_deadlist.h>
#include <zfeature_common.h>
#include "zfs_crrd.h"
#ifdef __cplusplus
extern "C" {
#endif
@ -344,6 +346,12 @@ struct spa {
spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
zthr_t *spa_checkpoint_discard_zthr;
kmutex_t spa_txg_log_time_lock; /* for spa_txg_log_time */
dbrrd_t spa_txg_log_time;
uint64_t spa_last_noted_txg;
uint64_t spa_last_noted_txg_time;
uint64_t spa_last_flush_txg_time;
space_map_t *spa_syncing_log_sm; /* current log space map */
avl_tree_t spa_sm_logs_by_txg;
kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */

75
include/zfs_crrd.h Normal file
View File

@ -0,0 +1,75 @@
// SPDX-License-Identifier: CDDL-1.0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024 Klara Inc.
*
* This software was developed by
* Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
* Fred Weigel <fred.weigel@klarasystems.com>
* under sponsorship from Wasabi Technology, Inc. and Klara Inc.
*/
#ifndef _CRRD_H_
#define _CRRD_H_
/* Number of (time, txg) records a single round-robin database can hold. */
#define	RRD_MAX_ENTRIES		(256)

/*
 * Integer size and integer count describing one rrd_t; the pair is handed to
 * zap_update()/zap_lookup() when a database is stored or loaded.
 */
#define	RRD_ENTRY_SIZE		(sizeof (uint64_t))
#define	RRD_STRUCT_ELEM		((sizeof (rrd_t)) / (RRD_ENTRY_SIZE))
/*
 * Selects which record a query returns relative to the requested time.
 * Records are expected to be stored in increasing time order.
 */
typedef enum {
/* Last record whose time is at or before the requested time. */
DBRRD_FLOOR,
/* First record whose time is at or after the requested time. */
DBRRD_CEILING
} dbrrd_rounding_t;
/* One database record: a timestamp and the txg noted at that time. */
typedef struct {
/* Timestamp as passed to rrd_add(). */
uint64_t rrdd_time;
/* Transaction group number associated with rrdd_time. */
uint64_t rrdd_txg;
} rrd_data_t;
/*
 * A fixed-size ring buffer of records.
 *
 * The structure is stored to and loaded from the pool as RRD_STRUCT_ELEM
 * integers of RRD_ENTRY_SIZE bytes, so its layout should be treated as part
 * of the on-disk format.
 */
typedef struct {
/* Index of the oldest record. */
uint64_t rrd_head; /* head (beginning) */
/* Index of the slot the next record will be written to. */
uint64_t rrd_tail; /* tail (end) */
/* Number of valid records, at most RRD_MAX_ENTRIES. */
uint64_t rrd_length;
rrd_data_t rrd_entries[RRD_MAX_ENTRIES];
} rrd_t;
/*
 * The complete TXG timestamp database: three ring buffers that are filled at
 * different rates by dbrrd_add() and searched together by dbrrd_query().
 */
typedef struct {
/* Highest resolution, shortest history. */
rrd_t dbr_minutes;
/* Intended to receive about one record per day. */
rrd_t dbr_days;
/* Intended to receive about one record per 30 days. */
rrd_t dbr_months;
} dbrrd_t;
/* Number of valid records in the database. */
size_t rrd_len(rrd_t *rrd);
/* Record i counted from the oldest one, or NULL if i is out of range. */
const rrd_data_t *rrd_entry(rrd_t *r, size_t i);
/* Most recently added record, or NULL if the database is empty. */
rrd_data_t *rrd_tail_entry(rrd_t *rrd);
/* Timestamp of the most recently added record, or 0 if there is none. */
uint64_t rrd_tail(rrd_t *rrd);
/* Txg of record i counted from the oldest one, or 0 if i is out of range. */
uint64_t rrd_get(rrd_t *rrd, size_t i);
/* Append a (time, txg) record to a single ring buffer. */
void rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg);
/* Record (time, txg) in whichever of the three databases is due. */
void dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg);
/* Txg of the record closest to tv in the given direction, or 0 if none. */
uint64_t dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rounding);
#endif

View File

@ -574,6 +574,7 @@
<elf-symbol name='zpool_reguid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_reopen_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_scan_range' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -6946,6 +6947,14 @@
<parameter type-id='b51cf3c2' name='cmd'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_scan_range' mangled-name='zpool_scan_range' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_scan_range'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='7313fbe2' name='func'/>
<parameter type-id='b51cf3c2' name='cmd'/>
<parameter type-id='c9d12d66' name='date_start'/>
<parameter type-id='c9d12d66' name='date_end'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_find_vdev_by_physpath' mangled-name='zpool_find_vdev_by_physpath' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_find_vdev_by_physpath'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='80f4b756' name='ppath'/>

View File

@ -2773,7 +2773,13 @@ out:
* Scan the pool.
*/
int
zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) {
return (zpool_scan_range(zhp, func, cmd, 0, 0));
}
int
zpool_scan_range(zpool_handle_t *zhp, pool_scan_func_t func,
pool_scrub_cmd_t cmd, time_t date_start, time_t date_end)
{
char errbuf[ERRBUFLEN];
int err;
@ -2782,6 +2788,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
nvlist_t *args = fnvlist_alloc();
fnvlist_add_uint64(args, "scan_type", (uint64_t)func);
fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd);
fnvlist_add_uint64(args, "scan_date_start", (uint64_t)date_start);
fnvlist_add_uint64(args, "scan_date_end", (uint64_t)date_end);
err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL);
fnvlist_free(args);

View File

@ -177,6 +177,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/zfs_byteswap.c \
module/zfs/zfs_chksum.c \
module/zfs/zfs_debug_common.c \
module/zfs/zfs_crrd.c \
module/zfs/zfs_fm.c \
module/zfs/zfs_fuid.c \
module/zfs/zfs_ratelimit.c \

View File

@ -2246,6 +2246,21 @@ Defer frees starting in this pass.
Maximum memory used for prefetching a checkpoint's space map on each
vdev while discarding the checkpoint.
.
.It Sy spa_note_txg_time Ns = Ns Sy 600 Pq uint
This parameter defines, in seconds, how often the TXG time database records
a new TXG.
Once the interval has elapsed, the current TXG is recorded if it differs from
the most recently recorded one.
These timestamps can later be used for more granular operations, such as
scrubbing.
.
.It Sy spa_flush_txg_time Ns = Ns Sy 600 Pq uint
This parameter defines, in seconds, how often ZFS flushes
the TXG time database to disk.
Flushing ensures that the data is actually written to persistent storage,
which helps preserve the database in case of an unexpected shutdown.
The database is also automatically flushed during the export sequence.
.
.It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint
Only allow small data blocks to be allocated on the special and dedup vdev
types when the available free space percentage on these vdevs exceeds this

View File

@ -28,7 +28,7 @@
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP.
.\"
.Dd November 18, 2024
.Dd December 11, 2024
.Dt ZPOOL-SCRUB 8
.Os
.
@ -40,6 +40,8 @@
.Cm scrub
.Op Ns Fl e | Ns Fl p | Fl s Ns | Fl C Ns
.Op Fl w
.Op Fl S Ar date
.Op Fl E Ar date
.Fl a Ns | Ns Ar pool Ns
.
.Sh DESCRIPTION
@ -125,6 +127,44 @@ resilvering, nor can it be run when a regular scrub is paused.
Continue scrub from last saved txg (see zpool
.Sy last_scrubbed_txg
property).
.It Fl S Ar date , Fl E Ar date
Allows specifying the date range for blocks created between these dates.
.Bl -bullet -compact -offset indent
.It
.Fl S
Defines a start date.
If not specified, scrubbing begins from the start of the pool's
existence.
.It
.Fl E
Defines an end date.
If not specified, scrubbing continues up to the most recent data.
.El
The provided date should be in the format:
.Dq YYYY-MM-DD HH:MM .
Where:
.Bl -bullet -compact -offset indent
.It
.Dq YYYY
is the year.
.It
.Dq MM
is the numeric representation of the month.
.It
.Dq DD
is the day of the month.
.It
.Dq HH
is the hour.
.It
.Dq MM
is the minutes.
.El
The hour and minutes can be omitted.
The time should be provided in the machine's local time zone.
Specifying a date earlier than the point at which this feature was enabled
results in scrubbing starting from the date the pool was created.
If the system time was manually moved backward, the date range may become
inaccurate.
.El
.Sh EXAMPLES
.Ss Example 1

View File

@ -406,6 +406,7 @@ ZFS_OBJS := \
zfs_byteswap.o \
zfs_chksum.o \
zfs_debug_common.o \
zfs_crrd.o \
zfs_fm.o \
zfs_fuid.o \
zfs_impl.o \

View File

@ -217,6 +217,7 @@ SRCS+= abd_os.c \
vdev_label_os.c \
zfs_acl.c \
zfs_ctldir.c \
zfs_crrd.c \
zfs_debug.c \
zfs_dir.c \
zfs_file_os.c \

View File

@ -100,6 +100,7 @@
#include <sys/vmsystm.h>
#endif /* _KERNEL */
#include "zfs_crrd.h"
#include "zfs_prop.h"
#include "zfs_comutil.h"
#include <cityhash.h>
@ -310,6 +311,41 @@ static int zfs_livelist_condense_zthr_cancel = 0;
*/
static int zfs_livelist_condense_new_alloc = 0;
/*
* Time variable to decide how often the txg should be added into the
* database (in seconds).
* The smallest available resolution is in minutes, which means an update occurs
* each time we reach `spa_note_txg_time` and the txg has changed. We provide
* a 256-slot ring buffer for minute-level resolution. The number is limited by
* the size of the structure we use and the maximum amount of bytes we can write
* into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately
* 144 records per day. Given the 256 slots, this provides roughly 1.5 days of
* high-resolution data.
*
* The user can decrease `spa_note_txg_time` to increase resolution within
* a day, at the cost of retaining fewer days of data. Alternatively, increasing
* the interval allows storing data over a longer period, but with lower
* frequency.
*
* This parameter does not affect the daily or monthly databases, as those only
* store one record per day and per month, respectively.
*/
static uint_t spa_note_txg_time = 10 * 60;
/*
 * How often to flush the txg database to disk (in seconds).
 * By default this equals the interval at which new records are added, so the
 * database is flushed every time it is written to, which is the most reliable
 * option. Since this happens every 10 minutes, it shouldn't introduce any
 * noticeable overhead for the system. In case of failure, we will always have
 * an up-to-date version of the database.
 *
 * The user can lower the flush interval, but it probably doesn't make sense
 * to flush more often than the database is updated.
 * The user can also increase the interval if they're concerned about the
 * performance of writing the entire database to disk.
 */
static uint_t spa_flush_txg_time = 10 * 60;
/*
* ==========================================================================
* SPA properties routines
@ -2040,6 +2076,111 @@ spa_destroy_aux_threads(spa_t *spa)
}
}
/*
 * Note the current time for txg in the in-memory TXG timestamp database and,
 * when a flush is due, write all three databases to the pool directory
 * object.
 *
 * Nothing happens on a non-writeable pool, or while fewer than
 * spa_note_txg_time seconds have passed since the last record was noted.
 * Because that check comes first, it also bounds how often a flush can occur.
 */
static void
spa_sync_time_logger(spa_t *spa, uint64_t txg)
{
uint64_t curtime;
dmu_tx_t *tx;
if (!spa_writeable(spa)) {
return;
}
curtime = gethrestime_sec();
/* Too soon since the last noted record: neither note nor flush. */
if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) {
return;
}
/* Only a txg newer than the last noted one produces a new record. */
if (txg > spa->spa_last_noted_txg) {
spa->spa_last_noted_txg_time = curtime;
spa->spa_last_noted_txg = txg;
/* The lock is also taken by the scrub ioctl while it queries. */
mutex_enter(&spa->spa_txg_log_time_lock);
dbrrd_add(&spa->spa_txg_log_time, curtime, txg);
mutex_exit(&spa->spa_txg_log_time_lock);
}
/* Flushing has its own interval, spa_flush_txg_time. */
if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) {
return;
}
spa->spa_last_flush_txg_time = curtime;
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
/* Each database is written whole, under its own directory key. */
VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
&spa->spa_txg_log_time.dbr_minutes, tx));
VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
&spa->spa_txg_log_time.dbr_days, tx));
VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
&spa->spa_txg_log_time.dbr_months, tx));
dmu_tx_commit(tx);
}
/*
 * Force one last note-and-flush of the TXG timestamp database.
 *
 * A transaction is created and assigned (waiting if necessary) only to learn
 * a txg number; spa_sync_time_logger() is then called with that txg.
 *
 * NOTE(review): spa_sync_time_logger() creates and commits its own
 * transaction for the same txg while this one is still open -- confirm that
 * this is valid from this calling context.
 */
static void
spa_unload_sync_time_logger(spa_t *spa)
{
uint64_t txg;
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
txg = dmu_tx_get_txg(tx);
/*
 * Clear both timestamps so that the interval checks in
 * spa_sync_time_logger() do not suppress this final update.
 */
spa->spa_last_noted_txg_time = 0;
spa->spa_last_flush_txg_time = 0;
spa_sync_time_logger(spa, txg);
dmu_tx_commit(tx);
}
static void
spa_load_txg_log_time(spa_t *spa)
{
int error;
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
&spa->spa_txg_log_time.dbr_minutes);
if (error != 0 && error != ENOENT) {
spa_load_note(spa, "unable to load a txg time database with "
"minute resolution [error=%d]", error);
}
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
&spa->spa_txg_log_time.dbr_days);
if (error != 0 && error != ENOENT) {
spa_load_note(spa, "unable to load a txg time database with "
"day resolution [error=%d]", error);
}
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
&spa->spa_txg_log_time.dbr_months);
if (error != 0 && error != ENOENT) {
spa_load_note(spa, "unable to load a txg time database with "
"month resolution [error=%d]", error);
}
}
/*
 * Decide whether the TXG timestamp database should be flushed while the pool
 * is being unloaded.  That is the case only when the pool is writeable, its
 * sync machinery is on, it is in the exported state, and at least one txg has
 * been noted.
 */
static boolean_t
spa_should_sync_time_logger_on_unload(spa_t *spa)
{
	if (spa_writeable(spa) && spa->spa_sync_on &&
	    spa_state(spa) == POOL_STATE_EXPORTED &&
	    spa->spa_last_noted_txg != 0) {
		return (B_TRUE);
	}

	return (B_FALSE);
}
/*
* Opposite of spa_load().
*/
@ -2061,6 +2202,9 @@ spa_unload(spa_t *spa)
* we delay the final TXGs beyond what spa_final_txg is set at.
*/
if (spa->spa_final_txg == UINT64_MAX) {
if (spa_should_sync_time_logger_on_unload(spa))
spa_unload_sync_time_logger(spa);
/*
* If the log space map feature is enabled and the pool is
* getting exported (but not destroyed), we want to spend some
@ -4717,6 +4861,9 @@ spa_ld_get_props(spa_t *spa)
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/* Load time log */
spa_load_txg_log_time(spa);
/*
* Load the persistent error log. If we have an older pool, this will
* not be present.
@ -7140,6 +7287,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa_config_exit(spa, SCL_ALL, FTAG);
}
if (spa_should_sync_time_logger_on_unload(spa))
spa_unload_sync_time_logger(spa);
/*
* If the log space map feature is enabled and the pool is
* getting exported (but not destroyed), we want to spend some
@ -10190,6 +10340,8 @@ spa_sync(spa_t *spa, uint64_t txg)
*/
brt_pending_apply(spa, txg);
spa_sync_time_logger(spa, txg);
/*
* Lock out configuration changes.
*/
@ -10232,6 +10384,7 @@ spa_sync(spa_t *spa, uint64_t txg)
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
spa->spa_sync_starttime = gethrtime();
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
@ -11105,6 +11258,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW,
"How frequently TXG timestamps are stored internally (in seconds)");
ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW,
"How frequently the TXG timestamps database should be flushed "
"to disk (in seconds)");
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,

View File

@ -715,6 +715,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_txg_log_time_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@ -903,6 +904,7 @@ spa_remove(spa_t *spa)
mutex_destroy(&spa->spa_vdev_top_lock);
mutex_destroy(&spa->spa_feat_stats_lock);
mutex_destroy(&spa->spa_activities_lock);
mutex_destroy(&spa->spa_txg_log_time_lock);
kmem_free(spa, sizeof (spa_t));
}

227
module/zfs/zfs_crrd.c Normal file
View File

@ -0,0 +1,227 @@
// SPDX-License-Identifier: CDDL-1.0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2024 Klara Inc.
*
* This software was developed by
* Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
* Fred Weigel <fred.weigel@klarasystems.com>
* under sponsorship from Wasabi Technology, Inc. and Klara Inc.
*/
/*
* This file implements a round-robin database that stores timestamps and txg
* numbers. Due to limited space, we use a round-robin approach, where
* the oldest records are overwritten when there is no longer enough room.
* This is a best-effort mechanism, and the database should be treated as
* an approximation. Consider this before consuming it.
*
* The database is linear, meaning we assume each new entry is newer than the
* ones already stored. Because of this, if time is manipulated, the database
* will only accept records that are newer than the existing ones.
* (For example, jumping 10 years into the future and then back can lead to
* a situation where nothing is written to the database for 10 years.)
*
* All times stored in the database use UTC, which makes it easy to convert to
* and from local time.
*
* Each database holds 256 records (as defined in the `RRD_MAX_ENTRIES` macro).
* This limit comes from the maximum size of a ZAP object, where we store the
* binary blob.
*
* We've split the database into three smaller ones.
* The `minute database` provides high resolution (default: every 10 minutes),
* but only covers approximately 1.5 days. This gives a detailed view of recent
* activity, useful, for example, when performing a scrub of the last hour.
* The `daily database` records one txg per day. With 256 entries, it retains
* roughly 8 months of data. This allows users to scrub or analyze txgs across
* a range of days.
* The `monthly database` stores one record per month, giving approximately
* 21 years of history.
* All these calculations assume the worst-case scenario: the pool is always
* online and actively written to.
*
* A potential source of confusion is that the database does not store data
* while the pool is offline, leading to potential gaps in timeline. Also,
* the database contains no records from before this feature was enabled.
* Both, upon reflection, are expected.
*/
#include <sys/zfs_context.h>
#include "zfs_crrd.h"
/*
 * Return the most recently added record, or NULL if the database is empty.
 */
rrd_data_t *
rrd_tail_entry(rrd_t *rrd)
{
	size_t newest;

	if (rrd_len(rrd) == 0)
		return (NULL);

	/* rrd_tail is the next slot to fill; step back one, wrapping at 0. */
	newest = (rrd->rrd_tail == 0) ?
	    RRD_MAX_ENTRIES - 1 : rrd->rrd_tail - 1;

	return (&rrd->rrd_entries[newest]);
}
/*
 * Return the timestamp of the most recently added record, or 0 when the
 * database holds no records.
 */
uint64_t
rrd_tail(rrd_t *rrd)
{
	const rrd_data_t *newest = rrd_tail_entry(rrd);

	if (newest == NULL)
		return (0);

	return (newest->rrdd_time);
}
/*
 * Return the number of records currently stored in the rrd.
 * rrd_entry() and rrd_get() accept indices 0..rrd_len()-1, where index 0 is
 * the oldest record.
 */
size_t
rrd_len(rrd_t *rrd)
{
return (rrd->rrd_length);
}
/*
 * Return record i, counted from the oldest record, or NULL when i is not
 * below rrd_len().
 */
const rrd_data_t *
rrd_entry(rrd_t *rrd, size_t i)
{
	if (i < rrd_len(rrd)) {
		/* Logical index i is offset from the head, with wrap-around. */
		size_t slot = (rrd->rrd_head + i) % RRD_MAX_ENTRIES;

		return (&rrd->rrd_entries[slot]);
	}

	return (NULL);
}
/*
 * Return the txg of record i, counted from the oldest record, or 0 when i is
 * out of range.
 */
uint64_t
rrd_get(rrd_t *rrd, size_t i)
{
	const rrd_data_t *entry = rrd_entry(rrd, i);

	if (entry == NULL)
		return (0);

	return (entry->rrdd_txg);
}
/*
 * Add a (time, txg) record to the database.
 *
 * If the newest record already carries the same timestamp, no new record is
 * appended: the existing one is kept and its txg is raised to txg when txg is
 * larger.  Otherwise the record is written at the tail; once the ring buffer
 * is full the oldest record is overwritten.
 */
void
rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg)
{
	rrd_data_t *tail;

	tail = rrd_tail_entry(rrd);
	if (tail != NULL && tail->rrdd_time == time) {
		/*
		 * Same timestamp as the newest record: update it in place.
		 * Falling through here would store a duplicate record and
		 * waste one of the limited slots.
		 */
		if (tail->rrdd_txg < txg)
			tail->rrdd_txg = txg;
		return;
	}

	rrd->rrd_entries[rrd->rrd_tail].rrdd_time = time;
	rrd->rrd_entries[rrd->rrd_tail].rrdd_txg = txg;

	rrd->rrd_tail = (rrd->rrd_tail + 1) % RRD_MAX_ENTRIES;

	if (rrd->rrd_length < RRD_MAX_ENTRIES) {
		rrd->rrd_length++;
	} else {
		/* Buffer full: the oldest record was just overwritten. */
		rrd->rrd_head = (rrd->rrd_head + 1) % RRD_MAX_ENTRIES;
	}
}
/*
 * Record (time, txg) in one of the three databases.
 *
 * time is in seconds: the sync path passes gethrestime_sec() and the scrub
 * ioctl queries with dates expressed in seconds.  The thresholds below are
 * therefore plain second counts; converting them to nanoseconds would make
 * them unreachable and leave the day and month databases empty.
 *
 * The record goes to the month database if at least 30 days have passed since
 * its newest record, else to the day database if at least one day has passed
 * since its newest record, else to the minute database as long as time has
 * not moved backwards relative to its newest record.
 */
void
dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg)
{
	const hrtime_t day_secs = 24 * 60 * 60;
	const hrtime_t month_secs = 30 * day_secs;
	hrtime_t daydiff, monthdiff, minutedif;

	/* Subtract as signed values so a backwards clock yields a negative. */
	minutedif = time - (hrtime_t)rrd_tail(&db->dbr_minutes);
	daydiff = time - (hrtime_t)rrd_tail(&db->dbr_days);
	monthdiff = time - (hrtime_t)rrd_tail(&db->dbr_months);

	if (monthdiff >= month_secs)
		rrd_add(&db->dbr_months, time, txg);
	else if (daydiff >= day_secs)
		rrd_add(&db->dbr_days, time, txg);
	else if (minutedif >= 0)
		rrd_add(&db->dbr_minutes, time, txg);
}
/*
 * Find the record matching tv in a single database.
 *
 * With DBRRD_FLOOR the last record whose time does not exceed tv is returned;
 * with DBRRD_CEILING the first record whose time is not below tv.  NULL is
 * returned when no record qualifies.
 *
 * A binary search would work too, but this is called rarely and the data set
 * is small, so a linear scan from the oldest record is used.
 */
static const rrd_data_t *
rrd_query(rrd_t *rrd, hrtime_t tv, dbrrd_rounding_t rounding)
{
	const rrd_data_t *match = NULL;
	size_t count = rrd_len(rrd);

	for (size_t idx = 0; idx < count; idx++) {
		const rrd_data_t *entry = rrd_entry(rrd, idx);

		if (rounding != DBRRD_FLOOR) {
			/* DBRRD_CEILING */
			if (tv <= entry->rrdd_time)
				return (entry);
			continue;
		}

		if (tv < entry->rrdd_time)
			break;
		match = entry;
	}

	return (match);
}
/*
 * Of two candidate records, return the one whose timestamp is nearer to tv.
 * A NULL candidate loses to a non-NULL one; NULL is returned only when both
 * are NULL.  On a tie r2 is returned.
 */
static const rrd_data_t *
dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2)
{
	if (r1 == NULL)
		return (r2);
	if (r2 == NULL)
		return (r1);

	/*
	 * rrdd_time is uint64_t.  Without the casts the subtraction would be
	 * carried out as unsigned, ABS() would have no effect, and a record
	 * lying after tv would appear to be at a huge distance, so the
	 * farther record would win.
	 */
	return (ABS(tv - (hrtime_t)r1->rrdd_time) <
	    ABS(tv - (hrtime_t)r2->rrdd_time) ? r1 : r2);
}
/*
 * Look up tv in all three databases and return the txg of the candidate whose
 * timestamp is closest to tv, or 0 when none of the databases has a match.
 */
uint64_t
dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rounding)
{
	const rrd_data_t *by_minute = rrd_query(&r->dbr_minutes, tv, rounding);
	const rrd_data_t *by_day = rrd_query(&r->dbr_days, tv, rounding);
	const rrd_data_t *by_month = rrd_query(&r->dbr_months, tv, rounding);
	const rrd_data_t *best;

	best = dbrrd_closest(tv, by_day, by_minute);
	best = dbrrd_closest(tv, best, by_month);

	if (best == NULL)
		return (0);

	return (best->rrdd_txg);
}

View File

@ -1704,6 +1704,8 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc)
/*
 * Input nvlist keys for the pool scrub ioctl.  The two date keys are
 * optional; zfs_ioc_pool_scrub() treats a missing date as 0, meaning that
 * end of the range is unbounded.
 */
static const zfs_ioc_key_t zfs_keys_pool_scrub[] = {
{"scan_type", DATA_TYPE_UINT64, 0},
{"scan_command", DATA_TYPE_UINT64, 0},
{"scan_date_start", DATA_TYPE_UINT64, ZK_OPTIONAL},
{"scan_date_end", DATA_TYPE_UINT64, ZK_OPTIONAL},
};
static int
@ -1712,6 +1714,7 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
spa_t *spa;
int error;
uint64_t scan_type, scan_cmd;
uint64_t date_start, date_end;
if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0)
return (SET_ERROR(EINVAL));
@ -1721,6 +1724,11 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
if (scan_cmd >= POOL_SCRUB_FLAGS_END)
return (SET_ERROR(EINVAL));
if (nvlist_lookup_uint64(innvl, "scan_date_start", &date_start) != 0)
date_start = 0;
if (nvlist_lookup_uint64(innvl, "scan_date_end", &date_end) != 0)
date_end = 0;
if ((error = spa_open(poolname, &spa, FTAG)) != 0)
return (error);
@ -1732,7 +1740,24 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
error = spa_scan_range(spa, scan_type,
spa_get_last_scrubbed_txg(spa), 0);
} else {
error = spa_scan(spa, scan_type);
uint64_t txg_start, txg_end;
txg_start = txg_end = 0;
if (date_start != 0 || date_end != 0) {
mutex_enter(&spa->spa_txg_log_time_lock);
if (date_start != 0) {
txg_start = dbrrd_query(&spa->spa_txg_log_time,
date_start, DBRRD_FLOOR);
}
if (date_end != 0) {
txg_end = dbrrd_query(&spa->spa_txg_log_time,
date_end, DBRRD_CEILING);
}
mutex_exit(&spa->spa_txg_log_time_lock);
}
error = spa_scan_range(spa, scan_type, txg_start, txg_end);
}
spa_close(spa, FTAG);

View File

@ -545,7 +545,8 @@ tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
'zpool_scrub_multiple_pools',
'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos',
'zpool_scrub_date_range_001']
tags = ['functional', 'cli_root', 'zpool_scrub']
[tests/functional/cli_root/zpool_set]

View File

@ -87,6 +87,7 @@ SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation
SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit
SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data
SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata
SPA_NOTE_TXG_TIME spa.note_txg_time spa_note_txg_time
TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min
TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip
TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch

View File

@ -1244,6 +1244,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \
functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh \
functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \
functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \
functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \

View File

@ -0,0 +1,94 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright 2025 Klara, Inc.
# Copyright 2025 Mariusz Zaborski <oshogbo@FreeBSD.org>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
#
# DESCRIPTION:
# Verify that the date range scrub only scrubs the files that were
# created/modified within a given time slot.
#
# STRATEGY:
# 1. Write a file.
# 2. Force a sync of everything via export/import.
# 3. Wait for one minute.
# 4. Repeat steps 1, 2, and 3 two more times.
# 5. Inject checksum errors into all 3 files.
# 6. Scrub the date range for the first file.
# 7. Verify that the first file is scrubbed.
# 8. Verify that newer files are not scrubbed.
# 9. Repeat steps 6-8 for each of the remaining 2 files.
#
verify_runnable "global"
# Undo what the test changed: clear the injected faults, delete the test
# files and restore the saved SPA_NOTE_TXG_TIME tunable.
function cleanup
{
log_must zinject -c all
rm -f $TESTDIR/*_file
log_must restore_tunable SPA_NOTE_TXG_TIME
}
log_onexit cleanup
log_assert "Verify scrub, -E, and -S show expected status."
# Note a txg every 30 seconds instead of the default interval, so that each
# of the 60 second pauses below can produce a record in the txg time database.
log_must save_tunable SPA_NOTE_TXG_TIME
log_must set_tunable64 SPA_NOTE_TXG_TIME 30
# date_list receives two timestamps per file: index 2*i is taken just before
# file i is written and index 2*i+1 after it has been written and synced.
typeset -a date_list
for i in `seq 0 2`; do
# Let time pass and force a sync via export/import before the start date.
log_must sleep 60
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
date_list+=("$(date '+%Y-%m-%d %H:%M')")
log_must file_write -o create -f"$TESTDIR/${i}_file" \
-b 512 -c 2048 -dR
# Same again after the write, to obtain the end date for this file.
log_must sleep 60
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
date_list+=("$(date '+%Y-%m-%d %H:%M')")
done
# Inject checksum errors into every file so that any scrub reading a file
# reports it.
for i in `seq 0 2`; do
log_must zinject -t data -e checksum -f 100 $TESTDIR/${i}_file
done
# Scrub the date range recorded for file i: that file must show up in the
# status output, the other files must not.
for i in `seq 0 2`; do
log_must zpool scrub -w -S "${date_list[$((i * 2))]}" -E "${date_list[$((i * 2 + 1))]}" $TESTPOOL
log_must eval "zpool status -v $TESTPOOL | grep '${i}_file'"
for j in `seq 0 2`; do
if [ $i == $j ]; then
continue
fi
log_mustnot eval "zpool status -v $TESTPOOL | grep '${j}_file'"
done
done
log_pass "Verified scrub, -E, and -S show expected status."