diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am
index 576b1ba2c..6c629ff5a 100644
--- a/cmd/zstream/Makefile.am
+++ b/cmd/zstream/Makefile.am
@@ -8,6 +8,7 @@ zstream_SOURCES = \
 	%D%/zstream.c \
 	%D%/zstream.h \
 	%D%/zstream_decompress.c \
+	%D%/zstream_drop_record.c \
 	%D%/zstream_dump.c \
 	%D%/zstream_recompress.c \
 	%D%/zstream_redup.c \
diff --git a/cmd/zstream/zstream.c b/cmd/zstream/zstream.c
index d3417c69c..f1a2fa757 100644
--- a/cmd/zstream/zstream.c
+++ b/cmd/zstream/zstream.c
@@ -43,6 +43,8 @@ zstream_usage(void)
 	    "\n"
 	    "\tzstream decompress [-v] [OBJECT,OFFSET[,TYPE]] ...\n"
 	    "\n"
+	    "\tzstream drop_record [-v] [OBJECT,OFFSET] ...\n"
+	    "\n"
 	    "\tzstream recompress [ -l level] TYPE\n"
 	    "\n"
 	    "\tzstream token resume_token\n"
@@ -68,6 +70,8 @@ main(int argc, char *argv[])
 		return (zstream_do_dump(argc - 1, argv + 1));
 	} else if (strcmp(subcommand, "decompress") == 0) {
 		return (zstream_do_decompress(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "drop_record") == 0) {
+		return (zstream_do_drop_record(argc - 1, argv + 1));
 	} else if (strcmp(subcommand, "recompress") == 0) {
 		return (zstream_do_recompress(argc - 1, argv + 1));
 	} else if (strcmp(subcommand, "token") == 0) {
diff --git a/cmd/zstream/zstream.h b/cmd/zstream/zstream.h
index fb46e5462..ba4c7456d 100644
--- a/cmd/zstream/zstream.h
+++ b/cmd/zstream/zstream.h
@@ -28,6 +28,7 @@ extern "C" {
 extern int zstream_do_redup(int, char *[]);
 extern int zstream_do_dump(int, char *[]);
 extern int zstream_do_decompress(int argc, char *argv[]);
+extern int zstream_do_drop_record(int argc, char *argv[]);
 extern int zstream_do_recompress(int argc, char *argv[]);
 extern int zstream_do_token(int, char *[]);
 extern void zstream_usage(void);
diff --git a/cmd/zstream/zstream_drop_record.c b/cmd/zstream/zstream_drop_record.c
new file mode 100644
index 000000000..b6895bd52
--- /dev/null
+++ b/cmd/zstream/zstream_drop_record.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2026 ConnectWise. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "zfs_fletcher.h"
+#include "zstream.h"
+#include "zstream_util.h"
+
+/*
+ * Size of the buffer used to format an "object,offset" hash key. Two
+ * 64-bit numbers in decimal (at most 20 digits each), the comma and the
+ * terminating NUL need 42 bytes.
+ */
+#define	DROP_RECORD_KEYSIZE	64
+
+/*
+ * Parse "str" as an unsigned 64-bit number in any base strtoull(3) accepts
+ * with a base argument of 0. Exits with an error naming "what" if the
+ * string is empty, is followed by other characters, or is out of range.
+ */
+static uint64_t
+parse_number(const char *str, const char *what)
+{
+	char *end;
+	uint64_t value;
+
+	/* strtoull() only sets errno on failure, so clear it first. */
+	errno = 0;
+	value = strtoull(str, &end, 0);
+	if (errno != 0 || end == str || *end != '\0')
+		errx(1, "invalid value for %s", what);
+	return (value);
+}
+
+/*
+ * Return B_TRUE if the record at (object, offset) was named on the command
+ * line, i.e. if its key was entered into the hsearch(3) table.
+ */
+static boolean_t
+record_is_selected(uint64_t object, uint64_t offset)
+{
+	char key[DROP_RECORD_KEYSIZE];
+	ENTRY e = {.key = key};
+
+	(void) snprintf(key, sizeof (key), "%llu,%llu",
+	    (u_longlong_t)object, (u_longlong_t)offset);
+	return (hsearch(e, FIND) != NULL ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Read a "size"-byte record payload from stdin into "buf", which holds
+ * "bufsz" bytes. The length comes from the stream itself, so one that
+ * does not fit is reported as an error instead of overflowing the buffer.
+ */
+static void
+read_payload(char *buf, size_t bufsz, uint64_t size)
+{
+	if (size > bufsz) {
+		errx(1, "record payload of %llu bytes is too large",
+		    (u_longlong_t)size);
+	}
+	(void) sfread(buf, size, stdin);
+}
+
+/*
+ * Implements "zstream drop_record [-v] [OBJECT,OFFSET] ...".
+ *
+ * Copies the send stream on standard input to standard output, leaving out
+ * every WRITE and WRITE_EMBEDDED record whose object and offset match one
+ * of the arguments; the records that are kept are re-emitted through
+ * dump_record() with their checksum field cleared. With -v, each dropped
+ * record is reported on standard error.
+ *
+ * Returns 0 once the input is exhausted or dump_record() reports a failure.
+ * Invalid arguments, a terminal on standard input, a deduplicated or
+ * truncated stream, and read errors terminate the process instead.
+ */
+int
+zstream_do_drop_record(int argc, char *argv[])
+{
+	size_t bufsz = SPA_MAXBLOCKSIZE;
+	char *buf = safe_malloc(bufsz);
+	dmu_replay_record_t thedrr;
+	dmu_replay_record_t *drr = &thedrr;
+	zio_cksum_t stream_cksum;
+	int c;
+	boolean_t verbose = B_FALSE;
+
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			zstream_usage();
+			break;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 0)
+		zstream_usage();
+
+	if (hcreate(argc) == 0)
+		errx(1, "hcreate");
+	for (int i = 0; i < argc; i++) {
+		uint64_t object, offset;
+		char *obj_str;
+		char *offset_str;
+		char *key;
+
+		obj_str = strsep(&argv[i], ",");
+		if (argv[i] == NULL) {
+			/* There was no comma, so the offset is missing. */
+			zstream_usage();
+			exit(2);
+		}
+		object = parse_number(obj_str, "object");
+		/* Anything following a second comma is ignored. */
+		offset_str = strsep(&argv[i], ",");
+		offset = parse_number(offset_str, "offset");
+
+		/*
+		 * The table stores the key pointer rather than a copy, so
+		 * the key has to outlive this loop iteration.
+		 */
+		if (asprintf(&key, "%llu,%llu", (u_longlong_t)object,
+		    (u_longlong_t)offset) < 0) {
+			err(1, "asprintf");
+		}
+		ENTRY e = {.key = key};
+		ENTRY *p;
+
+		p = hsearch(e, ENTER);
+		if (p == NULL)
+			errx(1, "hsearch");
+		p->data = (void*)(intptr_t)B_TRUE;
+	}
+
+	if (isatty(STDIN_FILENO)) {
+		(void) fprintf(stderr,
+		    "Error: The send stream is a binary format "
+		    "and can not be read from a\n"
+		    "terminal. Standard input must be redirected.\n");
+		exit(1);
+	}
+
+	fletcher_4_init();
+	int begin = 0;
+	boolean_t seen = B_FALSE;
+	while (sfread(drr, sizeof (*drr), stdin) != 0) {
+		uint64_t payload_size = 0;
+		boolean_t drop = B_FALSE;
+
+		/*
+		 * We need to regenerate the checksum.
+		 */
+		if (drr->drr_type != DRR_BEGIN) {
+			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
+			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
+		}
+
+		switch (drr->drr_type) {
+		case DRR_BEGIN:
+		{
+			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+			VERIFY0(begin++);
+			seen = B_TRUE;
+
+			uint32_t sz = drr->drr_payloadlen;
+
+			VERIFY3U(sz, <=, 1U << 28);
+
+			if (sz != 0) {
+				/* Grow the shared buffer to fit. */
+				if (sz > bufsz) {
+					buf = realloc(buf, sz);
+					if (buf == NULL)
+						err(1, "realloc");
+					bufsz = sz;
+				}
+				(void) sfread(buf, sz, stdin);
+			}
+			payload_size = sz;
+			break;
+		}
+		case DRR_END:
+		{
+			struct drr_end *drre = &drr->drr_u.drr_end;
+			/*
+			 * We would prefer to just check --begin == 0, but
+			 * replication streams have an end of stream END
+			 * record, so we must avoid tripping it.
+			 */
+			VERIFY3B(seen, ==, B_TRUE);
+			begin--;
+			/*
+			 * Use the recalculated checksum, unless this is
+			 * the END record of a stream package, which has
+			 * no checksum.
+			 */
+			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
+				drre->drr_checksum = stream_cksum;
+			break;
+		}
+
+		case DRR_OBJECT:
+		{
+			struct drr_object *drro = &drr->drr_u.drr_object;
+			VERIFY3S(begin, ==, 1);
+
+			if (drro->drr_bonuslen > 0) {
+				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+				read_payload(buf, bufsz, payload_size);
+			}
+			break;
+		}
+
+		case DRR_SPILL:
+		{
+			struct drr_spill *drrs = &drr->drr_u.drr_spill;
+			VERIFY3S(begin, ==, 1);
+			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+			read_payload(buf, bufsz, payload_size);
+			break;
+		}
+
+		case DRR_WRITE_BYREF:
+			VERIFY3S(begin, ==, 1);
+			fprintf(stderr,
+			    "Deduplicated streams are not supported\n");
+			exit(1);
+			break;
+
+		case DRR_WRITE:
+		{
+			struct drr_write *drrw = &drr->drr_u.drr_write;
+
+			VERIFY3S(begin, ==, 1);
+			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+			/*
+			 * The payload has to be consumed whether or not the
+			 * record is kept.
+			 */
+			read_payload(buf, bufsz, payload_size);
+			if (record_is_selected(drrw->drr_object,
+			    drrw->drr_offset)) {
+				drop = B_TRUE;
+				if (verbose)
+					fprintf(stderr,
+					    "Dropping WRITE record for object "
+					    "%llu offset %llu\n",
+					    (u_longlong_t)drrw->drr_object,
+					    (u_longlong_t)drrw->drr_offset);
+			}
+			break;
+		}
+
+		case DRR_WRITE_EMBEDDED:
+		{
+			struct drr_write_embedded *drrwe =
+			    &drr->drr_u.drr_write_embedded;
+
+			VERIFY3S(begin, ==, 1);
+			payload_size =
+			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
+			read_payload(buf, bufsz, payload_size);
+			if (record_is_selected(drrwe->drr_object,
+			    drrwe->drr_offset)) {
+				drop = B_TRUE;
+				if (verbose)
+					fprintf(stderr,
+					    "Dropping WRITE_EMBEDDED record for"
+					    " object %llu offset %llu\n",
+					    (u_longlong_t)drrwe->drr_object,
+					    (u_longlong_t)drrwe->drr_offset);
+			}
+			break;
+		}
+
+		case DRR_FREEOBJECTS:
+		case DRR_FREE:
+		case DRR_OBJECT_RANGE:
+			VERIFY3S(begin, ==, 1);
+			break;
+
+		default:
+			(void) fprintf(stderr, "INVALID record type 0x%x\n",
+			    drr->drr_type);
+			/* should never happen, so assert */
+			assert(B_FALSE);
+		}
+
+		/*
+		 * All input comes from stdin, so that is the stream whose
+		 * end-of-file and error indicators show a payload that
+		 * could not be read in full.
+		 */
+		if (feof(stdin)) {
+			fprintf(stderr, "Error: unexpected end-of-file\n");
+			exit(1);
+		}
+		if (ferror(stdin)) {
+			fprintf(stderr, "Error while reading file: %s\n",
+			    strerror(errno));
+			exit(1);
+		}
+
+		/*
+		 * A dropped record is not handed to dump_record(), so it
+		 * never reaches the output.
+		 */
+		if (drop)
+			continue;
+
+		/*
+		 * We need to recalculate the checksum, and it needs to be
+		 * initially zero to do that. BEGIN records don't have
+		 * a checksum.
+		 */
+		if (drr->drr_type != DRR_BEGIN) {
+			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
+			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
+		}
+		if (dump_record(drr, buf, payload_size,
+		    &stream_cksum, STDOUT_FILENO) != 0)
+			break;
+		if (drr->drr_type == DRR_END) {
+			/*
+			 * Typically the END record is either the last
+			 * thing in the stream, or it is followed
+			 * by a BEGIN record (which also zeros the checksum).
+			 * However, a stream package ends with two END
+			 * records. The last END record's checksum starts
+			 * from zero.
+			 */
+			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+		}
+	}
+	free(buf);
+	fletcher_4_fini();
+	hdestroy();
+
+	return (0);
+}
diff --git a/man/man8/zstream.8 b/man/man8/zstream.8
index 5b3d063bc..e38bde33c 100644
--- a/man/man8/zstream.8
+++ b/man/man8/zstream.8
@@ -21,7 +21,7 @@
 .\"
 .\" Copyright (c) 2020 by Delphix. All rights reserved.
 .\"
-.Dd November 10, 2022
+.Dd February 20, 2026
 .Dt ZSTREAM 8
 .Os
 .
@@ -38,6 +38,10 @@
 .Op Fl v
 .Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \&, Ns Ar type Ns ...
 .Nm
+.Cm drop_record
+.Op Fl v
+.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \& Ns ...
+.Nm
 .Cm redup
 .Op Fl v
 .Ar file
@@ -127,6 +131,21 @@ Print summary of decompressed records.
.El .It Xo .Nm +.Cm drop_record +.Op Fl v +.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \& ... +.Xc +Drop selected records from a ZFS send stream provided on standard input, +specified by object number and byte offset. +Only WRITE and WRITE_EMBEDDED records are supported, currently. +The repaired stream will be written to standard output. +.Bl -tag -width "-v" +.It Fl v +Verbose. +Print summary of dropped records. +.El +.It Xo +.Nm .Cm redup .Op Fl v .Ar file @@ -178,7 +197,7 @@ non-default level is desired). .El . .Sh EXAMPLES -Heal a dataset that was corrupted due to OpenZFS bug #12762. +.Ss Recovering from OpenZFS bug #12762 First, determine which records are corrupt. That cannot be done automatically; it requires information beyond ZFS's metadata. @@ -193,8 +212,24 @@ then run this command: .No # Nm zfs Ar send Fl c Ar … | Nm zstream decompress Ar 128,0,lz4 | \ Nm zfs recv Ar … .Ed +. +.Ss Recovering from OpenZFS bug #18239 +The bogus records typically have an absurdly large offset, and can be seen with +a command like +.Nm zdb Fl ddddd Ar dataset Ar object +or +.Nm zstream Ar dump Fl v . +To recover, send the dataset and use +.Nm zstream +to drop the bogus record, then receive into a new dataset. +.Bd -literal +.No # Nm zfs Ar send Ar ... | Nm zstream drop_record Ar 3545761,18446744073709486080 | \ +Nm zfs recv Ar ... 
+.Ed
 .Sh SEE ALSO
+.Xr zdb 8 ,
 .Xr zfs 8 ,
 .Xr zfs-receive 8 ,
 .Xr zfs-send 8 ,
 .Lk https://github.com/openzfs/zfs/issues/12762
+.Lk https://github.com/openzfs/zfs/issues/18239
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 0ad81335d..e5ded9343 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -998,7 +998,8 @@ tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
     'send_encrypted_props', 'send_encrypted_truncated_files',
     'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files',
     'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw',
-    'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid',
+    'send-wR_encrypted_zvol', 'send-zstream_drop_record',
+    'send_partial_dataset', 'send_invalid',
     'send_large_blocks_incremental', 'send_large_blocks_initial',
     'send_large_microzap_incremental', 'send_large_microzap_transitive',
     'send_doall', 'send_raw_spill_block', 'send_raw_ashift',
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index d87dc8697..9d60ce3f2 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -2094,6 +2094,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/rsend/send_realloc_files.ksh \
 	functional/rsend/send_spill_block.ksh \
 	functional/rsend/send-wR_encrypted_zvol.ksh \
+	functional/rsend/send-zstream_drop_record.ksh \
 	functional/rsend/setup.ksh \
 	functional/scrub_mirror/cleanup.ksh \
 	functional/scrub_mirror/scrub_mirror_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/rsend/send-zstream_drop_record.ksh b/tests/zfs-tests/tests/functional/rsend/send-zstream_drop_record.ksh
new file mode 100755
index 000000000..a2e810fd4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/rsend/send-zstream_drop_record.ksh
@@ -0,0 +1,78 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2026 by ConnectWise. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+. $STF_SUITE/include/math.shlib
+
+#
+# Description:
+# Verify that "zstream drop_record" can remove a record from a stream
+#
+# Strategy:
+# 1. Create a file containing multiple records, both full size and embedded.
+# 2. Send the dataset and drop some records
+# 3. Verify the dropped records are no longer present
+# 4. Verify that "zfs recv" can still receive the filtered stream.
+
+verify_runnable "both"
+
+log_assert "Verify zstream drop_record correctly drops records."
+log_onexit cleanup_pool $POOL2
+
+typeset sendfs=$POOL2/fs
+typeset recvfs=$POOL2/fs2
+typeset stream=$BACKDIR/stream
+typeset filtered=$BACKDIR/filtered
+typeset dump=$BACKDIR/dump
+
+log_must zfs create -o compress=lz4 $sendfs
+typeset dir=$(get_prop mountpoint $sendfs)
+
+log_must truncate -s 1m $dir/full_records
+# Create some full size records
+log_must dd if=/dev/urandom of=$dir/full_records conv=notrunc bs=128k count=2
+
+# Create a file with an embedded record. I don't know how to create a file
+# with two embedded records.
+# For lz4, this method works for blocks up to 16k, but not larger
+recsize=16384
+if is_illumos; then
+	log_must mkholes -h 0:$((recsize - 8)) -d $((recsize - 8)):8 \
+	    $dir/embedded_records
+else
+	log_must truncate -s $recsize $dir/embedded_records
+	log_must dd if=/dev/urandom of=$dir/embedded_records \
+	    seek=$((recsize - 8)) bs=1 count=8 conv=notrunc
+fi
+
+log_must zfs snapshot $sendfs@snap
+typeset inode1=$(get_objnum $dir/full_records)
+typeset inode2=$(get_objnum $dir/embedded_records)
+
+# Verify that the requested records, and only them, were dropped
+log_must eval "zfs send -ce $sendfs@snap > $stream"
+log_must eval "zstream drop_record $inode1,131072 $inode2,0 < $stream > $filtered"
+log_must eval "zstream dump -v < $filtered > $dump"
+log_must grep -qE "^WRITE object = $inode1\>.*offset = 0" $dump
+log_mustnot grep -qE "^WRITE object = $inode1\>.*offset = 131072" $dump
+log_mustnot grep -qE "^WRITE_EMBEDDED object = $inode2\>.*offset = 0" $dump
+
+# Verify that the filtered stream can be received
+log_must eval "zfs recv $recvfs < $filtered"
+
+log_pass "zstream drop_record correctly drops records."