zstream: add a drop_record subcommand

It can be used to drop extraneous records in a send stream caused by a
corrupt dataset, as in issue #18239.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alan Somers <asomers@gmail.com>
Sponsored by:	ConnectWise
Closes #18275
This commit is contained in:
Alan Somers 2026-03-12 16:08:58 -06:00 committed by GitHub
parent 7f65e04abd
commit 753f1e1e21
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 449 additions and 3 deletions

View File

@ -8,6 +8,7 @@ zstream_SOURCES = \
%D%/zstream.c \
%D%/zstream.h \
%D%/zstream_decompress.c \
%D%/zstream_drop_record.c \
%D%/zstream_dump.c \
%D%/zstream_recompress.c \
%D%/zstream_redup.c \

View File

@ -43,6 +43,8 @@ zstream_usage(void)
"\n"
"\tzstream decompress [-v] [OBJECT,OFFSET[,TYPE]] ...\n"
"\n"
"\tzstream drop_record [-v] [OBJECT,OFFSET] ...\n"
"\n"
"\tzstream recompress [ -l level] TYPE\n"
"\n"
"\tzstream token resume_token\n"
@ -68,6 +70,8 @@ main(int argc, char *argv[])
return (zstream_do_dump(argc - 1, argv + 1));
} else if (strcmp(subcommand, "decompress") == 0) {
return (zstream_do_decompress(argc - 1, argv + 1));
} else if (strcmp(subcommand, "drop_record") == 0) {
return (zstream_do_drop_record(argc - 1, argv + 1));
} else if (strcmp(subcommand, "recompress") == 0) {
return (zstream_do_recompress(argc - 1, argv + 1));
} else if (strcmp(subcommand, "token") == 0) {

View File

@ -28,6 +28,7 @@ extern "C" {
extern int zstream_do_redup(int, char *[]);
extern int zstream_do_dump(int, char *[]);
extern int zstream_do_decompress(int argc, char *argv[]);
extern int zstream_do_drop_record(int argc, char *argv[]);
extern int zstream_do_recompress(int argc, char *argv[]);
extern int zstream_do_token(int, char *[]);
extern void zstream_usage(void);

View File

@ -0,0 +1,324 @@
// SPDX-License-Identifier: CDDL-1.0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2026 ConnectWise. All rights reserved.
* Use is subject to license terms.
*/
#include <err.h>
#include <search.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zstd/zstd.h>
#include "zfs_fletcher.h"
#include "zstream.h"
#include "zstream_util.h"
/*
 * Size of the scratch buffer used to format an "object,offset" hash key.
 * Two 64-bit decimal numbers, a comma and a NUL need at most 42 bytes.
 */
enum { DROP_RECORD_KEYSIZE = 64 };

/*
 * Read a record payload of `size` bytes from stdin into *bufp.
 *
 * The buffer is grown first (and *bufszp updated) when the payload does not
 * fit, so a stream that advertises a payload larger than the current buffer
 * cannot overrun it.  Exits the program if the buffer cannot be grown.
 */
static void
drop_record_read_payload(char **bufp, size_t *bufszp, uint64_t size)
{
	if (size > *bufszp) {
		size_t want = (size_t)size;

		/* Guard against truncation where size_t is narrower. */
		if ((uint64_t)want != size)
			errx(1, "record payload too large");
		char *grown = realloc(*bufp, want);
		if (grown == NULL)
			err(1, "realloc");
		*bufp = grown;
		*bufszp = want;
	}
	(void) sfread(*bufp, size, stdin);
}

/*
 * Return B_TRUE if the user asked for the record at (object, offset) to be
 * dropped, i.e. if a matching key was entered into the hsearch(3) table
 * while parsing the command line.
 */
static boolean_t
drop_record_requested(uint64_t object, uint64_t offset)
{
	char key[DROP_RECORD_KEYSIZE];

	(void) snprintf(key, sizeof (key), "%llu,%llu",
	    (u_longlong_t)object, (u_longlong_t)offset);
	ENTRY e = {.key = key};
	return (hsearch(e, FIND) != NULL ? B_TRUE : B_FALSE);
}

/*
 * Implementation of "zstream drop_record [-v] [OBJECT,OFFSET] ...".
 *
 * Copies a send stream from standard input to standard output, omitting
 * every WRITE and WRITE_EMBEDDED record whose object number and offset match
 * one of the OBJECT,OFFSET arguments.  Record and stream checksums are
 * regenerated on the way out, since dropping a record changes them.
 *
 * argc/argv are the subcommand's own arguments (argv[0] is the subcommand
 * name).  Returns 0 once the input is exhausted or writing a record fails;
 * invalid arguments and malformed input terminate the program.
 */
int
zstream_do_drop_record(int argc, char *argv[])
{
	size_t bufsz = SPA_MAXBLOCKSIZE;
	char *buf = safe_malloc(bufsz);
	dmu_replay_record_t thedrr;
	dmu_replay_record_t *drr = &thedrr;
	zio_cksum_t stream_cksum;
	int c;
	boolean_t verbose = B_FALSE;

	while ((c = getopt(argc, argv, "v")) != -1) {
		switch (c) {
		case 'v':
			verbose = B_TRUE;
			break;
		case '?':
			(void) fprintf(stderr, "invalid option '%c'\n",
			    optopt);
			zstream_usage();
			break;
		}
	}

	argc -= optind;
	argv += optind;

	if (argc < 0)
		zstream_usage();

	/* One hash table entry per OBJECT,OFFSET argument. */
	if (hcreate(argc) == 0)
		errx(1, "hcreate");

	for (int i = 0; i < argc; i++) {
		uint64_t object, offset;
		char *obj_str;
		char *offset_str;
		char *key;
		char *end;

		/* strsep() leaves argv[i] NULL when there is no comma. */
		obj_str = strsep(&argv[i], ",");
		if (argv[i] == NULL) {
			zstream_usage();
			exit(2);
		}
		errno = 0;
		object = strtoull(obj_str, &end, 0);
		/* end == obj_str means no digits were consumed at all. */
		if (errno || end == obj_str || *end != '\0')
			errx(1, "invalid value for object");
		offset_str = strsep(&argv[i], ",");
		errno = 0;
		offset = strtoull(offset_str, &end, 0);
		if (errno || end == offset_str || *end != '\0')
			errx(1, "invalid value for offset");

		/*
		 * Store the key in canonical decimal form so that it matches
		 * the keys built from the stream's records, whatever base
		 * the user typed the numbers in.
		 */
		if (asprintf(&key, "%llu,%llu", (u_longlong_t)object,
		    (u_longlong_t)offset) < 0) {
			err(1, "asprintf");
		}
		ENTRY e = {.key = key};
		ENTRY *p;

		p = hsearch(e, ENTER);
		if (p == NULL)
			errx(1, "hsearch");
		p->data = (void*)(intptr_t)B_TRUE;
	}

	if (isatty(STDIN_FILENO)) {
		(void) fprintf(stderr,
		    "Error: The send stream is a binary format "
		    "and can not be read from a\n"
		    "terminal.  Standard input must be redirected.\n");
		exit(1);
	}

	fletcher_4_init();
	int begin = 0;
	boolean_t seen = B_FALSE;
	while (sfread(drr, sizeof (*drr), stdin) != 0) {
		uint64_t payload_size = 0;
		/* Set when the current record must not be written out. */
		boolean_t drop = B_FALSE;

		/*
		 * We need to regenerate the checksum.
		 */
		if (drr->drr_type != DRR_BEGIN) {
			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
		}

		switch (drr->drr_type) {
		case DRR_BEGIN:
		{
			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
			VERIFY0(begin++);
			seen = B_TRUE;

			uint32_t sz = drr->drr_payloadlen;

			VERIFY3U(sz, <=, 1U << 28);

			if (sz != 0)
				drop_record_read_payload(&buf, &bufsz, sz);
			payload_size = sz;
			break;
		}
		case DRR_END:
		{
			struct drr_end *drre = &drr->drr_u.drr_end;
			/*
			 * We would prefer to just check --begin == 0, but
			 * replication streams have an end of stream END
			 * record, so we must avoid tripping it.
			 */
			VERIFY3B(seen, ==, B_TRUE);
			begin--;
			/*
			 * Use the recalculated checksum, unless this is
			 * the END record of a stream package, which has
			 * no checksum.
			 */
			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
				drre->drr_checksum = stream_cksum;
			break;
		}

		case DRR_OBJECT:
		{
			struct drr_object *drro = &drr->drr_u.drr_object;
			VERIFY3S(begin, ==, 1);

			if (drro->drr_bonuslen > 0) {
				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
				drop_record_read_payload(&buf, &bufsz,
				    payload_size);
			}
			break;
		}

		case DRR_SPILL:
		{
			struct drr_spill *drrs = &drr->drr_u.drr_spill;
			VERIFY3S(begin, ==, 1);
			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
			drop_record_read_payload(&buf, &bufsz, payload_size);
			break;
		}

		case DRR_WRITE_BYREF:
			VERIFY3S(begin, ==, 1);
			fprintf(stderr,
			    "Deduplicated streams are not supported\n");
			exit(1);
			break;

		case DRR_WRITE:
		{
			struct drr_write *drrw = &drr->drr_u.drr_write;

			VERIFY3S(begin, ==, 1);
			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
			/*
			 * The payload must be consumed from the input even
			 * when the record is going to be discarded.
			 */
			drop_record_read_payload(&buf, &bufsz, payload_size);

			if (drop_record_requested(drrw->drr_object,
			    drrw->drr_offset)) {
				if (verbose)
					fprintf(stderr,
					    "Dropping WRITE record for object "
					    "%llu offset %llu\n",
					    (u_longlong_t)drrw->drr_object,
					    (u_longlong_t)drrw->drr_offset);
				drop = B_TRUE;
			}
			break;
		}

		case DRR_WRITE_EMBEDDED:
		{
			struct drr_write_embedded *drrwe =
			    &drr->drr_u.drr_write_embedded;

			VERIFY3S(begin, ==, 1);
			/* The embedded payload is padded to 8 bytes. */
			payload_size =
			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
			drop_record_read_payload(&buf, &bufsz, payload_size);

			if (drop_record_requested(drrwe->drr_object,
			    drrwe->drr_offset)) {
				if (verbose)
					fprintf(stderr,
					    "Dropping WRITE_EMBEDDED record for"
					    " object %llu offset %llu\n",
					    (u_longlong_t)drrwe->drr_object,
					    (u_longlong_t)drrwe->drr_offset);
				drop = B_TRUE;
			}
			break;
		}

		case DRR_FREEOBJECTS:
		case DRR_FREE:
		case DRR_OBJECT_RANGE:
			VERIFY3S(begin, ==, 1);
			break;

		default:
			(void) fprintf(stderr, "INVALID record type 0x%x\n",
			    drr->drr_type);
			/* should never happen, so assert */
			assert(B_FALSE);
		}

		/*
		 * All reads above are from stdin, so that is the stream whose
		 * EOF and error indicators tell us whether a payload was cut
		 * short.  Check before honouring a drop so that a truncated
		 * dropped record is reported too.
		 */
		if (feof(stdin)) {
			fprintf(stderr, "Error: unexpected end-of-file\n");
			exit(1);
		}
		if (ferror(stdin)) {
			fprintf(stderr, "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}

		/* Read and discard: the record is simply not written out. */
		if (drop)
			continue;

		/*
		 * We need to recalculate the checksum, and it needs to be
		 * initially zero to do that.  BEGIN records don't have
		 * a checksum.
		 */
		if (drr->drr_type != DRR_BEGIN) {
			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
		}
		if (dump_record(drr, buf, payload_size,
		    &stream_cksum, STDOUT_FILENO) != 0)
			break;
		if (drr->drr_type == DRR_END) {
			/*
			 * Typically the END record is either the last
			 * thing in the stream, or it is followed
			 * by a BEGIN record (which also zeros the checksum).
			 * However, a stream package ends with two END
			 * records.  The last END record's checksum starts
			 * from zero.
			 */
			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
		}
	}
	free(buf);
	fletcher_4_fini();
	hdestroy();

	return (0);
}

View File

@ -21,7 +21,7 @@
.\"
.\" Copyright (c) 2020 by Delphix. All rights reserved.
.\"
.Dd November 10, 2022
.Dd February 20, 2026
.Dt ZSTREAM 8
.Os
.
@ -38,6 +38,10 @@
.Op Fl v
.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \&, Ns Ar type Ns ...
.Nm
.Cm drop_record
.Op Fl v
.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \& Ns ...
.Nm
.Cm redup
.Op Fl v
.Ar file
@ -127,6 +131,21 @@ Print summary of decompressed records.
.El
.It Xo
.Nm
.Cm drop_record
.Op Fl v
.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \& ...
.Xc
Drop selected records from a ZFS send stream provided on standard input,
specified by object number and byte offset.
Only WRITE and WRITE_EMBEDDED records are supported, currently.
The repaired stream will be written to standard output.
.Bl -tag -width "-v"
.It Fl v
Verbose.
Print summary of dropped records.
.El
.It Xo
.Nm
.Cm redup
.Op Fl v
.Ar file
@ -178,7 +197,7 @@ non-default level is desired).
.El
.
.Sh EXAMPLES
Heal a dataset that was corrupted due to OpenZFS bug #12762.
.Ss Recovering from OpenZFS bug #12762
First, determine which records are corrupt.
That cannot be done automatically; it requires information beyond ZFS's
metadata.
@ -193,8 +212,24 @@ then run this command:
.No # Nm zfs Ar send Fl c Ar | Nm zstream decompress Ar 128,0,lz4 | \
Nm zfs recv Ar …
.Ed
.
.Ss Recovering from OpenZFS bug #18239
The bogus records typically have an absurdly large offset, and can be seen with
a command like
.Nm zdb Fl ddddd Ar dataset Ar object
or
.Nm zstream Ar dump Fl v .
To recover, send the dataset and use
.Nm zstream
to drop the bogus record, then receive into a new dataset.
.Bd -literal
.No # Nm zfs Ar send Ar ... | Nm zstream drop_record Ar 3545761,18446744073709486080 | \
Nm zfs recv Ar ...
.Ed
.Sh SEE ALSO
.Xr zdb 8 ,
.Xr zfs 8 ,
.Xr zfs-receive 8 ,
.Xr zfs-send 8 ,
.Lk https://github.com/openzfs/zfs/issues/12762
.Lk https://github.com/openzfs/zfs/issues/18239

View File

@ -998,7 +998,8 @@ tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
'send_encrypted_props', 'send_encrypted_truncated_files',
'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files',
'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw',
'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid',
'send-wR_encrypted_zvol', 'send-zstream_drop_record',
'send_partial_dataset', 'send_invalid',
'send_large_blocks_incremental', 'send_large_blocks_initial',
'send_large_microzap_incremental', 'send_large_microzap_transitive',
'send_doall', 'send_raw_spill_block', 'send_raw_ashift',

View File

@ -2094,6 +2094,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/rsend/send_realloc_files.ksh \
functional/rsend/send_spill_block.ksh \
functional/rsend/send-wR_encrypted_zvol.ksh \
functional/rsend/send-zstream_drop_record.ksh \
functional/rsend/setup.ksh \
functional/scrub_mirror/cleanup.ksh \
functional/scrub_mirror/scrub_mirror_001_pos.ksh \

View File

@ -0,0 +1,79 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2026 by ConnectWise. All rights reserved.
#
. $STF_SUITE/tests/functional/rsend/rsend.kshlib
. $STF_SUITE/include/math.shlib

#
# Description:
# Verify that "zstream drop_record" can remove a record from a stream
#
# Strategy:
# 1. Create a file containing multiple records, both full size and embedded.
# 2. Send the dataset and drop some records
# 3. Verify the dropped records are no longer present
# 4. Verify that "zfs recv" can still receive the filtered stream.
#

verify_runnable "both"

log_assert "Verify zstream drop_record correctly drops records."
log_onexit cleanup_pool $POOL2

typeset sendfs=$POOL2/fs
typeset recvfs=$POOL2/fs2
typeset stream=$BACKDIR/stream
typeset filtered=$BACKDIR/filtered
typeset dump=$BACKDIR/dump

log_must zfs create -o compress=lz4 $sendfs
typeset dir=$(get_prop mountpoint $sendfs)

# Create some full size records
log_must truncate -s 1m $dir/full_records
log_must dd if=/dev/urandom of=$dir/full_records conv=notrunc bs=128k count=2

# Create a file with an embedded record.  I don't know how to create a file
# with two embedded records.
# For lz4, this method works for blocks up to 16k, but not larger, so the
# file is a single 16k record that is all zeros except for its last 8 bytes.
recsize=16384
if is_illumos; then
	log_must mkholes -h 0:$((recsize - 8)) -d $((recsize - 8)):8 \
	    $dir/embedded_records
else
	log_must truncate -s $recsize $dir/embedded_records
	log_must dd if=/dev/urandom of=$dir/embedded_records \
	    seek=$((recsize - 8)) bs=1 count=8 conv=notrunc
fi

log_must zfs snapshot $sendfs@snap
typeset inode1=$(get_objnum $dir/full_records)
typeset inode2=$(get_objnum $dir/embedded_records)

# Verify that the requested records, and only them, were dropped
log_must eval "zfs send -ce $sendfs@snap > $stream"
log_must eval "zstream drop_record $inode1,131072 $inode2,0 < $stream > $filtered"
log_must eval "zstream dump -v < $filtered > $dump"
log_must grep -qE "^WRITE object = $inode1\>.*offset = 0" $dump
log_mustnot grep -qE "^WRITE object = $inode1\>.*offset = 131072" $dump
log_mustnot grep -qE "^WRITE_EMBEDDED object = $inode2\>.*offset = 0" $dump

# Verify that the filtered stream (not the original one) can be received;
# this is what exercises the checksums regenerated by drop_record.
log_must eval "zfs recv $recvfs < $filtered"

log_pass "zstream drop_record correctly drops records."