mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-04-13 07:01:46 +03:00
zstream: add a drop_record subcommand
It can be used to drop extraneous records in a send stream caused by a corrupt dataset, as in issue #18239. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alan Somers <asomers@gmail.com> Sponsored by: ConnectWise Closes #18275
This commit is contained in:
parent
7f65e04abd
commit
753f1e1e21
@ -8,6 +8,7 @@ zstream_SOURCES = \
|
||||
%D%/zstream.c \
|
||||
%D%/zstream.h \
|
||||
%D%/zstream_decompress.c \
|
||||
%D%/zstream_drop_record.c \
|
||||
%D%/zstream_dump.c \
|
||||
%D%/zstream_recompress.c \
|
||||
%D%/zstream_redup.c \
|
||||
|
||||
@ -43,6 +43,8 @@ zstream_usage(void)
|
||||
"\n"
|
||||
"\tzstream decompress [-v] [OBJECT,OFFSET[,TYPE]] ...\n"
|
||||
"\n"
|
||||
"\tzstream drop_record [-v] [OBJECT,OFFSET] ...\n"
|
||||
"\n"
|
||||
"\tzstream recompress [ -l level] TYPE\n"
|
||||
"\n"
|
||||
"\tzstream token resume_token\n"
|
||||
@ -68,6 +70,8 @@ main(int argc, char *argv[])
|
||||
return (zstream_do_dump(argc - 1, argv + 1));
|
||||
} else if (strcmp(subcommand, "decompress") == 0) {
|
||||
return (zstream_do_decompress(argc - 1, argv + 1));
|
||||
} else if (strcmp(subcommand, "drop_record") == 0) {
|
||||
return (zstream_do_drop_record(argc - 1, argv + 1));
|
||||
} else if (strcmp(subcommand, "recompress") == 0) {
|
||||
return (zstream_do_recompress(argc - 1, argv + 1));
|
||||
} else if (strcmp(subcommand, "token") == 0) {
|
||||
|
||||
@ -28,6 +28,7 @@ extern "C" {
|
||||
extern int zstream_do_redup(int, char *[]);
|
||||
extern int zstream_do_dump(int, char *[]);
|
||||
extern int zstream_do_decompress(int argc, char *argv[]);
|
||||
extern int zstream_do_drop_record(int argc, char *argv[]);
|
||||
extern int zstream_do_recompress(int argc, char *argv[]);
|
||||
extern int zstream_do_token(int, char *[]);
|
||||
extern void zstream_usage(void);
|
||||
|
||||
324
cmd/zstream/zstream_drop_record.c
Normal file
324
cmd/zstream/zstream_drop_record.c
Normal file
@ -0,0 +1,324 @@
|
||||
// SPDX-License-Identifier: CDDL-1.0
|
||||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or https://opensource.org/licenses/CDDL-1.0.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright 2026 ConnectWise. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
|
||||
#include <err.h>
|
||||
#include <search.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/zfs_ioctl.h>
|
||||
#include <sys/zio_checksum.h>
|
||||
#include <sys/zstd/zstd.h>
|
||||
#include "zfs_fletcher.h"
|
||||
#include "zstream.h"
|
||||
#include "zstream_util.h"
|
||||
|
||||
int
|
||||
zstream_do_drop_record(int argc, char *argv[])
|
||||
{
|
||||
const int KEYSIZE = 64;
|
||||
int bufsz = SPA_MAXBLOCKSIZE;
|
||||
char *buf = safe_malloc(bufsz);
|
||||
dmu_replay_record_t thedrr;
|
||||
dmu_replay_record_t *drr = &thedrr;
|
||||
zio_cksum_t stream_cksum;
|
||||
int c;
|
||||
boolean_t verbose = B_FALSE;
|
||||
|
||||
while ((c = getopt(argc, argv, "v")) != -1) {
|
||||
switch (c) {
|
||||
case 'v':
|
||||
verbose = B_TRUE;
|
||||
break;
|
||||
case '?':
|
||||
(void) fprintf(stderr, "invalid option '%c'\n",
|
||||
optopt);
|
||||
zstream_usage();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
argc -= optind;
|
||||
argv += optind;
|
||||
|
||||
if (argc < 0)
|
||||
zstream_usage();
|
||||
|
||||
if (hcreate(argc) == 0)
|
||||
errx(1, "hcreate");
|
||||
for (int i = 0; i < argc; i++) {
|
||||
uint64_t object, offset;
|
||||
char *obj_str;
|
||||
char *offset_str;
|
||||
char *key;
|
||||
char *end;
|
||||
|
||||
obj_str = strsep(&argv[i], ",");
|
||||
if (argv[i] == NULL) {
|
||||
zstream_usage();
|
||||
exit(2);
|
||||
}
|
||||
errno = 0;
|
||||
object = strtoull(obj_str, &end, 0);
|
||||
if (errno || *end != '\0')
|
||||
errx(1, "invalid value for object");
|
||||
offset_str = strsep(&argv[i], ",");
|
||||
offset = strtoull(offset_str, &end, 0);
|
||||
if (errno || *end != '\0')
|
||||
errx(1, "invalid value for offset");
|
||||
|
||||
if (asprintf(&key, "%llu,%llu", (u_longlong_t)object,
|
||||
(u_longlong_t)offset) < 0) {
|
||||
err(1, "asprintf");
|
||||
}
|
||||
ENTRY e = {.key = key};
|
||||
ENTRY *p;
|
||||
|
||||
p = hsearch(e, ENTER);
|
||||
if (p == NULL)
|
||||
errx(1, "hsearch");
|
||||
p->data = (void*)(intptr_t)B_TRUE;
|
||||
}
|
||||
|
||||
if (isatty(STDIN_FILENO)) {
|
||||
(void) fprintf(stderr,
|
||||
"Error: The send stream is a binary format "
|
||||
"and can not be read from a\n"
|
||||
"terminal. Standard input must be redirected.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fletcher_4_init();
|
||||
int begin = 0;
|
||||
boolean_t seen = B_FALSE;
|
||||
while (sfread(drr, sizeof (*drr), stdin) != 0) {
|
||||
struct drr_write *drrw;
|
||||
uint64_t payload_size = 0;
|
||||
|
||||
/*
|
||||
* We need to regenerate the checksum.
|
||||
*/
|
||||
if (drr->drr_type != DRR_BEGIN) {
|
||||
memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
|
||||
sizeof (drr->drr_u.drr_checksum.drr_checksum));
|
||||
}
|
||||
|
||||
switch (drr->drr_type) {
|
||||
case DRR_BEGIN:
|
||||
{
|
||||
ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
|
||||
VERIFY0(begin++);
|
||||
seen = B_TRUE;
|
||||
|
||||
uint32_t sz = drr->drr_payloadlen;
|
||||
|
||||
VERIFY3U(sz, <=, 1U << 28);
|
||||
|
||||
if (sz != 0) {
|
||||
if (sz > bufsz) {
|
||||
buf = realloc(buf, sz);
|
||||
if (buf == NULL)
|
||||
err(1, "realloc");
|
||||
bufsz = sz;
|
||||
}
|
||||
(void) sfread(buf, sz, stdin);
|
||||
}
|
||||
payload_size = sz;
|
||||
break;
|
||||
}
|
||||
case DRR_END:
|
||||
{
|
||||
struct drr_end *drre = &drr->drr_u.drr_end;
|
||||
/*
|
||||
* We would prefer to just check --begin == 0, but
|
||||
* replication streams have an end of stream END
|
||||
* record, so we must avoid tripping it.
|
||||
*/
|
||||
VERIFY3B(seen, ==, B_TRUE);
|
||||
begin--;
|
||||
/*
|
||||
* Use the recalculated checksum, unless this is
|
||||
* the END record of a stream package, which has
|
||||
* no checksum.
|
||||
*/
|
||||
if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
|
||||
drre->drr_checksum = stream_cksum;
|
||||
break;
|
||||
}
|
||||
|
||||
case DRR_OBJECT:
|
||||
{
|
||||
struct drr_object *drro = &drr->drr_u.drr_object;
|
||||
VERIFY3S(begin, ==, 1);
|
||||
|
||||
if (drro->drr_bonuslen > 0) {
|
||||
payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
|
||||
(void) sfread(buf, payload_size, stdin);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DRR_SPILL:
|
||||
{
|
||||
struct drr_spill *drrs = &drr->drr_u.drr_spill;
|
||||
VERIFY3S(begin, ==, 1);
|
||||
payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
|
||||
(void) sfread(buf, payload_size, stdin);
|
||||
break;
|
||||
}
|
||||
|
||||
case DRR_WRITE_BYREF:
|
||||
VERIFY3S(begin, ==, 1);
|
||||
fprintf(stderr,
|
||||
"Deduplicated streams are not supported\n");
|
||||
exit(1);
|
||||
break;
|
||||
|
||||
case DRR_WRITE:
|
||||
{
|
||||
VERIFY3S(begin, ==, 1);
|
||||
drrw = &thedrr.drr_u.drr_write;
|
||||
payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
|
||||
ENTRY *p;
|
||||
char key[KEYSIZE];
|
||||
|
||||
snprintf(key, KEYSIZE, "%llu,%llu",
|
||||
(u_longlong_t)drrw->drr_object,
|
||||
(u_longlong_t)drrw->drr_offset);
|
||||
ENTRY e = {.key = key};
|
||||
|
||||
(void) sfread(buf, payload_size, stdin);
|
||||
p = hsearch(e, FIND);
|
||||
if (p == NULL) {
|
||||
/*
|
||||
* Dump the contents of the block unaltered
|
||||
*/
|
||||
} else {
|
||||
/*
|
||||
* Read and discard the block
|
||||
*/
|
||||
if (verbose)
|
||||
fprintf(stderr,
|
||||
"Dropping WRITE record for object "
|
||||
"%llu offset %llu\n",
|
||||
(u_longlong_t)drrw->drr_object,
|
||||
(u_longlong_t)drrw->drr_offset);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DRR_WRITE_EMBEDDED:
|
||||
{
|
||||
ENTRY *p;
|
||||
char key[KEYSIZE];
|
||||
|
||||
VERIFY3S(begin, ==, 1);
|
||||
struct drr_write_embedded *drrwe =
|
||||
&drr->drr_u.drr_write_embedded;
|
||||
payload_size =
|
||||
P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
|
||||
|
||||
snprintf(key, KEYSIZE, "%llu,%llu",
|
||||
(u_longlong_t)drrwe->drr_object,
|
||||
(u_longlong_t)drrwe->drr_offset);
|
||||
ENTRY e = {.key = key};
|
||||
|
||||
(void) sfread(buf, payload_size, stdin);
|
||||
p = hsearch(e, FIND);
|
||||
if (p == NULL) {
|
||||
/*
|
||||
* Dump the contents of the block unaltered
|
||||
*/
|
||||
} else {
|
||||
/*
|
||||
* Read and discard the block
|
||||
*/
|
||||
if (verbose)
|
||||
fprintf(stderr,
|
||||
"Dropping WRITE_EMBEDDED record for"
|
||||
" object %llu offset %llu\n",
|
||||
(u_longlong_t)drrwe->drr_object,
|
||||
(u_longlong_t)drrwe->drr_offset);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DRR_FREEOBJECTS:
|
||||
case DRR_FREE:
|
||||
case DRR_OBJECT_RANGE:
|
||||
VERIFY3S(begin, ==, 1);
|
||||
break;
|
||||
|
||||
default:
|
||||
(void) fprintf(stderr, "INVALID record type 0x%x\n",
|
||||
drr->drr_type);
|
||||
/* should never happen, so assert */
|
||||
assert(B_FALSE);
|
||||
}
|
||||
|
||||
if (feof(stdout)) {
|
||||
fprintf(stderr, "Error: unexpected end-of-file\n");
|
||||
exit(1);
|
||||
}
|
||||
if (ferror(stdout)) {
|
||||
fprintf(stderr, "Error while reading file: %s\n",
|
||||
strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to recalculate the checksum, and it needs to be
|
||||
* initially zero to do that. BEGIN records don't have
|
||||
* a checksum.
|
||||
*/
|
||||
if (drr->drr_type != DRR_BEGIN) {
|
||||
memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
|
||||
sizeof (drr->drr_u.drr_checksum.drr_checksum));
|
||||
}
|
||||
if (dump_record(drr, buf, payload_size,
|
||||
&stream_cksum, STDOUT_FILENO) != 0)
|
||||
break;
|
||||
if (drr->drr_type == DRR_END) {
|
||||
/*
|
||||
* Typically the END record is either the last
|
||||
* thing in the stream, or it is followed
|
||||
* by a BEGIN record (which also zeros the checksum).
|
||||
* However, a stream package ends with two END
|
||||
* records. The last END record's checksum starts
|
||||
* from zero.
|
||||
*/
|
||||
ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
|
||||
}
|
||||
}
|
||||
free(buf);
|
||||
fletcher_4_fini();
|
||||
hdestroy();
|
||||
|
||||
return (0);
|
||||
}
|
||||
@ -21,7 +21,7 @@
|
||||
.\"
|
||||
.\" Copyright (c) 2020 by Delphix. All rights reserved.
|
||||
.\"
|
||||
.Dd November 10, 2022
|
||||
.Dd February 20, 2026
|
||||
.Dt ZSTREAM 8
|
||||
.Os
|
||||
.
|
||||
@ -38,6 +38,10 @@
|
||||
.Op Fl v
|
||||
.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \&, Ns Ar type Ns ...
|
||||
.Nm
|
||||
.Cm drop_record
|
||||
.Op Fl v
|
||||
.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \& Ns ...
|
||||
.Nm
|
||||
.Cm redup
|
||||
.Op Fl v
|
||||
.Ar file
|
||||
@ -127,6 +131,21 @@ Print summary of decompressed records.
|
||||
.El
|
||||
.It Xo
|
||||
.Nm
|
||||
.Cm drop_record
|
||||
.Op Fl v
|
||||
.Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \& ...
|
||||
.Xc
|
||||
Drop selected records from a ZFS send stream provided on standard input,
|
||||
specified by object number and byte offset.
|
||||
Only WRITE and WRITE_EMBEDDED records are currently supported.
|
||||
The repaired stream will be written to standard output.
|
||||
.Bl -tag -width "-v"
|
||||
.It Fl v
|
||||
Verbose.
|
||||
Print summary of dropped records.
|
||||
.El
|
||||
.It Xo
|
||||
.Nm
|
||||
.Cm redup
|
||||
.Op Fl v
|
||||
.Ar file
|
||||
@ -178,7 +197,7 @@ non-default level is desired).
|
||||
.El
|
||||
.
|
||||
.Sh EXAMPLES
|
||||
Heal a dataset that was corrupted due to OpenZFS bug #12762.
|
||||
.Ss Recovering from OpenZFS bug #12762
|
||||
First, determine which records are corrupt.
|
||||
That cannot be done automatically; it requires information beyond ZFS's
|
||||
metadata.
|
||||
@ -193,8 +212,24 @@ then run this command:
|
||||
.No # Nm zfs Ar send Fl c Ar … | Nm zstream decompress Ar 128,0,lz4 | \
|
||||
Nm zfs recv Ar …
|
||||
.Ed
|
||||
.
|
||||
.Ss Recovering from OpenZFS bug #18239
|
||||
The bogus records typically have an absurdly large offset, and can be seen with
|
||||
a command like
|
||||
.Nm zdb Fl ddddd Ar dataset Ar object
|
||||
or
|
||||
.Nm zstream Ar dump Fl v .
|
||||
To recover, send the dataset and use
|
||||
.Nm zstream
|
||||
to drop the bogus record, then receive into a new dataset.
|
||||
.Bd -literal
|
||||
.No # Nm zfs Ar send Ar ... | Nm zstream drop_record Ar 3545761,18446744073709486080 | \
|
||||
Nm zfs recv Ar ...
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr zdb 8 ,
|
||||
.Xr zfs 8 ,
|
||||
.Xr zfs-receive 8 ,
|
||||
.Xr zfs-send 8 ,
|
||||
.Lk https://github.com/openzfs/zfs/issues/12762
|
||||
.Lk https://github.com/openzfs/zfs/issues/18239
|
||||
|
||||
@ -998,7 +998,8 @@ tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
|
||||
'send_encrypted_props', 'send_encrypted_truncated_files',
|
||||
'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files',
|
||||
'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw',
|
||||
'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid',
|
||||
'send-wR_encrypted_zvol', 'send-zstream_drop_record',
|
||||
'send_partial_dataset', 'send_invalid',
|
||||
'send_large_blocks_incremental', 'send_large_blocks_initial',
|
||||
'send_large_microzap_incremental', 'send_large_microzap_transitive',
|
||||
'send_doall', 'send_raw_spill_block', 'send_raw_ashift',
|
||||
|
||||
@ -2094,6 +2094,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/rsend/send_realloc_files.ksh \
|
||||
functional/rsend/send_spill_block.ksh \
|
||||
functional/rsend/send-wR_encrypted_zvol.ksh \
|
||||
functional/rsend/send-zstream_drop_record.ksh \
|
||||
functional/rsend/setup.ksh \
|
||||
functional/scrub_mirror/cleanup.ksh \
|
||||
functional/scrub_mirror/scrub_mirror_001_pos.ksh \
|
||||
|
||||
79
tests/zfs-tests/tests/functional/rsend/send-zstream_drop_record.ksh
Executable file
79
tests/zfs-tests/tests/functional/rsend/send-zstream_drop_record.ksh
Executable file
@ -0,0 +1,79 @@
|
||||
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0

#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright (c) 2026 by ConnectWise. All rights reserved.
#

. $STF_SUITE/tests/functional/rsend/rsend.kshlib
. $STF_SUITE/include/math.shlib

#
# Description:
# Verify that "zstream drop_record" can remove a record from a stream
#
# Strategy:
# 1. Create a file containing multiple records, both full size and embedded.
# 2. Send the dataset and drop some records
# 3. Verify the dropped records are no longer present
# 4. Verify that "zfs recv" can still receive the filtered stream.

verify_runnable "both"

log_assert "Verify zstream drop_record correctly drops records."
log_onexit cleanup_pool $POOL2

typeset sendfs=$POOL2/fs
typeset recvfs=$POOL2/fs2
typeset stream=$BACKDIR/stream
typeset filtered=$BACKDIR/filtered
typeset dump=$BACKDIR/dump

log_must zfs create -o compress=lz4 $sendfs
typeset dir=$(get_prop mountpoint $sendfs)

log_must truncate -s 1m $dir/full_records
# Create some full size records
log_must dd if=/dev/urandom of=$dir/full_records conv=notrunc bs=128k count=2

# Create a file with an embedded record.  I don't know how to create a file
# with two embedded records.
# For lz4, this method works for blocks up to 16k, but not larger
recsize=16384
if is_illumos; then
	log_must mkholes -h 0:$((recsize - 8)) -d $((recsize - 8)):8 \
	    $dir/embedded_records
else
	log_must truncate -s $recsize $dir/embedded_records
	log_must dd if=/dev/urandom of=$dir/embedded_records \
	    seek=$((recsize - 8)) bs=1 count=8 conv=notrunc
fi

log_must zfs snapshot $sendfs@snap
typeset inode1=$(get_objnum $dir/full_records)
typeset inode2=$(get_objnum $dir/embedded_records)

# Verify that the requested records, and only them, were dropped
log_must eval "zfs send -ce $sendfs@snap > $stream"
log_must eval "zstream drop_record $inode1,131072 $inode2,0 < $stream > $filtered"
log_must eval "zstream dump -v < $filtered > $dump"
log_must grep -qE "^WRITE object = $inode1\>.*offset = 0" $dump
log_mustnot grep -qE "^WRITE object = $inode1\>.*offset = 131072" $dump
log_mustnot grep -qE "^WRITE_EMBEDDED object = $inode2\>.*offset = 0" $dump

# Verify that the filtered stream (not the original) can be received
log_must eval "zfs recv $recvfs < $filtered"

log_pass "zstream drop_record correctly drops records."
|
||||
Loading…
Reference in New Issue
Block a user