diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 88aa7c91f..f1d686753 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -4746,7 +4746,7 @@ zfs_do_receive(int argc, char **argv)
nomem();
/* check options */
- while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) {
+ while ((c = getopt(argc, argv, ":o:x:dehMnuvFsAc")) != -1) {
switch (c) {
case 'o':
if (!parseprop(props, optarg)) {
@@ -4802,6 +4802,9 @@ zfs_do_receive(int argc, char **argv)
case 'A':
abort_resumable = B_TRUE;
break;
+ case 'c':
+ flags.heal = B_TRUE;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
diff --git a/contrib/pyzfs/libzfs_core/__init__.py b/contrib/pyzfs/libzfs_core/__init__.py
index 25ea3e495..a80f94b52 100644
--- a/contrib/pyzfs/libzfs_core/__init__.py
+++ b/contrib/pyzfs/libzfs_core/__init__.py
@@ -72,6 +72,7 @@ from ._libzfs_core import (
lzc_receive_resumable,
lzc_receive_with_cmdprops,
lzc_receive_with_header,
+ lzc_receive_with_heal,
lzc_release,
lzc_reopen,
lzc_rollback,
@@ -127,6 +128,7 @@ __all__ = [
'lzc_receive_resumable',
'lzc_receive_with_cmdprops',
'lzc_receive_with_header',
+ 'lzc_receive_with_heal',
'lzc_release',
'lzc_reopen',
'lzc_rollback',
diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py
index f494461f6..26676db39 100644
--- a/contrib/pyzfs/libzfs_core/_error_translation.py
+++ b/contrib/pyzfs/libzfs_core/_error_translation.py
@@ -469,6 +469,8 @@ def lzc_receive_translate_errors(
raise lzc_exc.ReadOnlyPool(_pool_name(snapname))
if ret == errno.EAGAIN:
raise lzc_exc.SuspendedPool(_pool_name(snapname))
+ if ret == errno.EACCES:
+ raise lzc_exc.EncryptionKeyNotLoaded()
if ret == ECKSUM:
raise lzc_exc.BadStream()
if ret == ZFS_ERR_WRONG_PARENT:
diff --git a/contrib/pyzfs/libzfs_core/_libzfs_core.py b/contrib/pyzfs/libzfs_core/_libzfs_core.py
index fcfa5be31..fa74ad9a7 100644
--- a/contrib/pyzfs/libzfs_core/_libzfs_core.py
+++ b/contrib/pyzfs/libzfs_core/_libzfs_core.py
@@ -1426,6 +1426,135 @@ def lzc_receive_with_cmdprops(
return (int(c_read_bytes[0]), action_handle)
+@_uncommitted()
+def lzc_receive_with_heal(
+ snapname, fd, begin_record, force=False, corrective=True, resumable=False,
+ raw=False, origin=None, props=None, cmdprops=None, key=None, cleanup_fd=-1,
+ action_handle=0
+):
+ '''
+ Like :func:`lzc_receive_with_cmdprops`, but allows the caller to pass an
+ additional 'corrective' argument. The 'corrective' boolean set to true
+ indicates that a corruption healing receive should be performed.
+
+ :param bytes snapname: the name of the snapshot to create.
+ :param int fd: the file descriptor from which to read the stream.
+ :param begin_record: the stream's begin record.
+ :type begin_record: ``cffi`` `CData` representing the dmu_replay_record_t
+ structure.
+ :param bool force: whether to roll back or destroy the target filesystem
+ if that is required to receive the stream.
+ :param bool corrective: whether this stream should be used to heal data.
+ :param bool resumable: whether this stream should be treated as resumable.
+ If the receive fails due to premature stream termination, the
+ intermediate state will be preserved on disk and may subsequently be
+ resumed with :func:`lzc_send_resume`.
+ :param bool raw: whether this is a "raw" stream.
+ :param origin: the optional origin snapshot name if the stream is for a
+ clone.
+ :type origin: bytes or None
+ :param props: the properties to set on the snapshot as *received*
+ properties.
+ :type props: dict of bytes : Any
+ :param cmdprops: the properties to set on the snapshot as local overrides
+ to *received* properties. `bool` values are forcefully inherited while
+ every other value is set locally as if the command "zfs set" was
+ invoked immediately before the receive.
+ :type cmdprops: dict of bytes : Any
+ :param key: raw bytes representing user's wrapping key
+ :type key: bytes
+ :param int cleanup_fd: file descriptor used to set a cleanup-on-exit file
+ descriptor.
+ :param int action_handle: variable used to pass the handle for guid/ds
+ mapping: this should be set to zero on first call and will contain an
+ updated handle on success, it should be passed in subsequent calls.
+
+ :return: a tuple with two elements where the first one is the number of
+ bytes read from the file descriptor and the second one is the
+ action_handle return value.
+
+ :raises IOError: if an input / output error occurs while reading from the
+ ``fd``.
+ :raises DatasetExists: if the snapshot named ``snapname`` already exists.
+ :raises DatasetExists: if the stream is a full stream and the destination
+ filesystem already exists.
+ :raises DatasetExists: if ``force`` is `True` but the destination
+ filesystem could not be rolled back to a matching snapshot because a
+ newer snapshot exists and it is an origin of a cloned filesystem.
+ :raises StreamMismatch: if an incremental stream is received and the latest
+ snapshot of the destination filesystem does not match the source
+ snapshot of the stream.
+ :raises StreamMismatch: if a full stream is received and the destination
+ filesystem already exists and it has at least one snapshot, and
+ ``force`` is `False`.
+ :raises StreamMismatch: if an incremental clone stream is received but the
+ specified ``origin`` is not the actual received origin.
+ :raises DestinationModified: if an incremental stream is received and the
+ destination filesystem has been modified since the last snapshot and
+ ``force`` is `False`.
+ :raises DestinationModified: if a full stream is received and the
+ destination filesystem already exists and it does not have any
+ snapshots, and ``force`` is `False`.
+ :raises DatasetNotFound: if the destination filesystem and its parent do
+ not exist.
+ :raises DatasetNotFound: if the ``origin`` is not `None` and does not
+ exist.
+ :raises DatasetBusy: if ``force`` is `True` but the destination filesystem
+ could not be rolled back to a matching snapshot because a newer
+ snapshot is held and could not be destroyed.
+ :raises DatasetBusy: if another receive operation is being performed on the
+ destination filesystem.
+ :raises EncryptionKeyNotLoaded: if ``corrective`` is `True` and the key
+ is not loaded for a non-raw corrective receive on an encrypted
+ dataset.
+ :raises BadStream: if ``corrective`` is `True` and the corrective receive
+ was not able to reconstruct a corrupted block.
+ :raises BadStream: if the stream is corrupt or it is not recognized or it
+ is a compound stream or it is a clone stream, but ``origin`` is `None`.
+ :raises BadStream: if a clone stream is received and the destination
+ filesystem already exists.
+ :raises StreamFeatureNotSupported: if ``corrective`` is `True` and the
+ stream is not compatible with the data in the pool.
+ :raises StreamFeatureNotSupported: if the stream has a feature that is not
+ supported on this side.
+ :raises ReceivePropertyFailure: if one or more of the specified properties
+ is invalid or has an invalid type or value.
+ :raises NameInvalid: if the name of either snapshot is invalid.
+ :raises NameTooLong: if the name of either snapshot is too long.
+ '''
+
+ if origin is not None:
+ c_origin = origin
+ else:
+ c_origin = _ffi.NULL
+ if action_handle is not None:
+ c_action_handle = _ffi.new("uint64_t *")
+ else:
+ c_action_handle = _ffi.NULL
+ c_read_bytes = _ffi.new("uint64_t *")
+ c_errflags = _ffi.new("uint64_t *")
+ if props is None:
+ props = {}
+ if cmdprops is None:
+ cmdprops = {}
+ if key is None:
+ key = b""
+ else:
+ key = bytes(key)
+
+ nvlist = nvlist_in(props)
+ cmdnvlist = nvlist_in(cmdprops)
+ properrs = {}
+ with nvlist_out(properrs) as c_errors:
+ ret = _lib.lzc_receive_with_heal(
+ snapname, nvlist, cmdnvlist, key, len(key), c_origin,
+ force, corrective, resumable, raw, fd, begin_record, cleanup_fd,
+ c_read_bytes, c_errflags, c_action_handle, c_errors)
+ errors.lzc_receive_translate_errors(
+ ret, snapname, fd, force, raw, False, False, origin, properrs)
+ return (int(c_read_bytes[0]), action_handle)
+
+
@_uncommitted()
def lzc_reopen(poolname, restart=True):
'''
diff --git a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
index 1b46a0891..bcb9ed379 100644
--- a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
+++ b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
@@ -112,6 +112,10 @@ CDEF = """
uint8_t *, uint_t, const char *, boolean_t, boolean_t,
boolean_t, int, const dmu_replay_record_t *, int, uint64_t *,
uint64_t *, uint64_t *, nvlist_t **);
+ int lzc_receive_with_heal(const char *, nvlist_t *, nvlist_t *,
+ uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t,
+ boolean_t, int, const dmu_replay_record_t *, int, uint64_t *,
+ uint64_t *, uint64_t *, nvlist_t **);
int lzc_receive_with_header(const char *, nvlist_t *, const char *,
boolean_t, boolean_t, boolean_t, int, const dmu_replay_record_t *);
int lzc_release(nvlist_t *, nvlist_t **);
diff --git a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py
index 9b1aea193..c94ae6de6 100644
--- a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py
+++ b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py
@@ -2911,6 +2911,27 @@ class ZFSTest(unittest.TestCase):
self.assertEqual(fs.getProperty("compression"), b"on")
self.assertEqual(fs.getProperty("ns:prop"), b"val")
+ def test_recv_with_heal(self):
+ snap = ZFSTest.pool.makeName(b"fs1@snap1")
+ fs = ZFSTest.pool.getFilesystem(b"fs1")
+ props = {}
+ cmdprops = {
+ b"compression": 0x01,
+ b"ns:prop": b"val"
+ }
+
+ lzc.lzc_snapshot([snap])
+ with tempfile.TemporaryFile(suffix='.zstream') as stream:
+ lzc.lzc_send(snap, None, stream.fileno())
+ stream.seek(0)
+ (header, c_header) = lzc.receive_header(stream.fileno())
+ lzc.lzc_receive_with_heal(
+ snap, stream.fileno(), c_header, props=props,
+ cmdprops=cmdprops)
+ self.assertExists(snap)
+ self.assertEqual(fs.getProperty("compression"), b"on")
+ self.assertEqual(fs.getProperty("ns:prop"), b"val")
+
def test_recv_with_cmdprops_and_recvprops(self):
fromsnap = ZFSTest.pool.makeName(b"fs1@snap1")
fs = ZFSTest.pool.getFilesystem(b"recv")
diff --git a/include/libzfs.h b/include/libzfs.h
index 52e59ac65..4948cd0d3 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -829,6 +829,9 @@ typedef struct recvflags {
/* force unmount while recv snapshot (private) */
boolean_t forceunmount;
+
+ /* use this recv to check (and heal if needed) an existing snapshot */
+ boolean_t heal;
} recvflags_t;
_LIBZFS_H int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 926d11eb5..14a4857c3 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -21,9 +21,9 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
- * Copyright (c) 2017 Datto Inc.
* Copyright 2017 RackTop Systems.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2019 Datto Inc.
*/
#ifndef _LIBZFS_CORE_H
@@ -114,6 +114,10 @@ _LIBZFS_CORE_H int lzc_receive_with_cmdprops(const char *, nvlist_t *,
nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t,
boolean_t, int, const struct dmu_replay_record *, int, uint64_t *,
uint64_t *, uint64_t *, nvlist_t **);
+_LIBZFS_CORE_H int lzc_receive_with_heal(const char *, nvlist_t *, nvlist_t *,
+ uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, boolean_t,
+ int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *,
+ uint64_t *, nvlist_t **);
_LIBZFS_CORE_H int lzc_send_space(const char *, const char *,
enum lzc_send_flags, uint64_t *);
_LIBZFS_CORE_H int lzc_send_space_resume_redacted(const char *, const char *,
diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h
index 41a65e827..538c73610 100644
--- a/include/sys/dmu_recv.h
+++ b/include/sys/dmu_recv.h
@@ -24,6 +24,7 @@
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
*/
#ifndef _DMU_RECV_H
@@ -47,6 +48,7 @@ typedef struct dmu_recv_cookie {
boolean_t drc_byteswap;
uint64_t drc_featureflags;
boolean_t drc_force;
+ boolean_t drc_heal;
boolean_t drc_resumable;
boolean_t drc_should_save;
boolean_t drc_raw;
@@ -78,7 +80,7 @@ typedef struct dmu_recv_cookie {
} dmu_recv_cookie_t;
int dmu_recv_begin(char *, char *, dmu_replay_record_t *,
- boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *,
+ boolean_t, boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *,
dmu_recv_cookie_t *, zfs_file_t *, offset_t *);
int dmu_recv_stream(dmu_recv_cookie_t *, offset_t *);
int dmu_recv_end(dmu_recv_cookie_t *, void *);
diff --git a/include/sys/spa.h b/include/sys/spa.h
index b53439a82..e185ce6b1 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -26,10 +26,10 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
- * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Datto Inc.
*/
#ifndef _SYS_SPA_H
@@ -1134,6 +1134,7 @@ extern const char *spa_state_to_name(spa_t *spa);
/* error handling */
struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
+extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb);
extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 3fa9c80d1..469b1266e 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -25,8 +25,8 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
- * Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019 Datto Inc.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -349,6 +349,7 @@ struct spa {
kmutex_t spa_errlist_lock; /* error list/ereport lock */
avl_tree_t spa_errlist_last; /* last error list */
avl_tree_t spa_errlist_scrub; /* scrub error list */
+ avl_tree_t spa_errlist_healed; /* list of healed blocks */
uint64_t spa_deflate; /* should we deflate? */
uint64_t spa_history; /* history object */
kmutex_t spa_history_lock; /* history lock */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index b6f8da760..23fdda457 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -534,6 +534,8 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *priv, enum zio_flag flags);
+extern void zio_destroy(zio_t *zio);
+
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index d0c90899a..640051e3b 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -436,6 +436,29 @@ send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv)
}
}
+/*
+ * Returns the guid of the snapshot "fs@snap",
+ * or 0 if either name is NULL/empty or the snapshot cannot be opened.
+ */
+static uint64_t
+get_snap_guid(libzfs_handle_t *hdl, const char *fs, const char *snap)
+{
+ char name[MAXPATHLEN + 1];
+ uint64_t guid = 0;
+
+ if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0')
+ return (guid);
+
+ (void) snprintf(name, sizeof (name), "%s@%s", fs, snap);
+ zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT);
+ if (zhp != NULL) {
+ guid = zfs_prop_get_int(zhp, ZFS_PROP_GUID);
+ zfs_close(zhp);
+ }
+
+ return (guid);
+}
+
/*
* returns snapshot creation txg
* and returns 0 if the snapshot does not exist
@@ -4541,9 +4564,34 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_REDACTED;
- if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) {
+ if (flags->heal) {
+ if (flags->isprefix || flags->istail || flags->force ||
+ flags->canmountoff || flags->resumable || flags->nomount ||
+ flags->skipholds) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "corrective recv can not be used when combined with"
+ " this flag"));
+ err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
+ goto out;
+ }
+ uint64_t guid =
+ get_snap_guid(hdl, name, strchr(destsnap, '@') + 1);
+ if (guid == 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "corrective recv must specify an existing snapshot"
+ " to heal"));
+ err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
+ goto out;
+ } else if (guid != drrb->drr_toguid) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "local snapshot doesn't match the snapshot"
+ " in the provided stream"));
+ err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf);
+ goto out;
+ }
+ } else if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) {
zfs_cmd_t zc = {"\0"};
- zfs_handle_t *zhp;
+ zfs_handle_t *zhp = NULL;
boolean_t encrypted;
(void) strcpy(zc.zc_name, name);
@@ -4737,8 +4785,9 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
}
if (flags->verbose) {
- (void) printf("%s %s stream of %s into %s\n",
+ (void) printf("%s %s%s stream of %s into %s\n",
flags->dryrun ? "would receive" : "receiving",
+ flags->heal ? "corrective " : "",
drrb->drr_fromguid ? "incremental" : "full",
drrb->drr_toname, destsnap);
(void) fflush(stdout);
@@ -4808,10 +4857,17 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
goto out;
}
- err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops,
- oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable,
- raw, infd, drr_noswap, -1, &read_bytes, &errflags,
- NULL, &prop_errors);
+ if (flags->heal) {
+ err = ioctl_err = lzc_receive_with_heal(destsnap, rcvprops,
+ oxprops, wkeydata, wkeylen, origin, flags->force,
+ flags->heal, flags->resumable, raw, infd, drr_noswap, -1,
+ &read_bytes, &errflags, NULL, &prop_errors);
+ } else {
+ err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops,
+ oxprops, wkeydata, wkeylen, origin, flags->force,
+ flags->resumable, raw, infd, drr_noswap, -1, &read_bytes,
+ &errflags, NULL, &prop_errors);
+ }
ioctl_errno = ioctl_err;
prop_errflags = errflags;
@@ -4933,7 +4989,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
break;
case EACCES:
- if (raw && stream_wantsnewfs) {
+ if (flags->heal) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "key must be loaded to do a non-raw "
+ "corrective recv on an encrypted "
+ "dataset."));
+ } else if (raw && stream_wantsnewfs) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"failed to create encryption key"));
} else if (raw && !stream_wantsnewfs) {
@@ -4973,8 +5034,14 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
break;
case ECKSUM:
case ZFS_ERR_STREAM_TRUNCATED:
- recv_ecksum_set_aux(hdl, destsnap, flags->resumable,
- ioctl_err == ECKSUM);
+ if (flags->heal)
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "corrective receive was not able to "
+ "reconstruct the data needed for "
+ "healing."));
+ else
+ recv_ecksum_set_aux(hdl, destsnap,
+ flags->resumable, ioctl_err == ECKSUM);
(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
break;
case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH:
@@ -4984,8 +5051,14 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
break;
case ENOTSUP:
- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "pool must be upgraded to receive this stream."));
+ if (flags->heal)
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "stream is not compatible with the "
+ "data in the pool."));
+ else
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "pool must be upgraded to receive this "
+ "stream."));
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
break;
case EDQUOT:
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index fae98469a..7e340e1d4 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -181,6 +181,7 @@
+
@@ -1741,6 +1742,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index d29133ab3..16bd9af1b 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -22,10 +22,10 @@
/*
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
- * Copyright (c) 2017 Datto Inc.
* Copyright 2017 RackTop Systems.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
* Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
*/
/*
@@ -986,7 +986,7 @@ recv_read(int fd, void *buf, int ilen)
static int
recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops,
uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force,
- boolean_t resumable, boolean_t raw, int input_fd,
+ boolean_t heal, boolean_t resumable, boolean_t raw, int input_fd,
const dmu_replay_record_t *begin_record, uint64_t *read_bytes,
uint64_t *errflags, nvlist_t **errors)
{
@@ -1041,7 +1041,7 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops,
/*
* All receives with a payload should use the new interface.
*/
- if (resumable || raw || wkeydata != NULL || payload) {
+ if (resumable || heal || raw || wkeydata != NULL || payload) {
nvlist_t *outnvl = NULL;
nvlist_t *innvl = fnvlist_alloc();
@@ -1081,6 +1081,8 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops,
if (resumable)
fnvlist_add_boolean(innvl, "resumable");
+ if (heal)
+ fnvlist_add_boolean(innvl, "heal");
error = lzc_ioctl(ZFS_IOC_RECV_NEW, fsname, innvl, &outnvl);
@@ -1180,7 +1182,7 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, boolean_t raw, int fd)
{
return (recv_impl(snapname, props, NULL, NULL, 0, origin, force,
- B_FALSE, raw, fd, NULL, NULL, NULL, NULL));
+ B_FALSE, B_FALSE, raw, fd, NULL, NULL, NULL, NULL));
}
/*
@@ -1194,7 +1196,7 @@ lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, boolean_t raw, int fd)
{
return (recv_impl(snapname, props, NULL, NULL, 0, origin, force,
- B_TRUE, raw, fd, NULL, NULL, NULL, NULL));
+ B_FALSE, B_TRUE, raw, fd, NULL, NULL, NULL, NULL));
}
/*
@@ -1217,7 +1219,7 @@ lzc_receive_with_header(const char *snapname, nvlist_t *props,
return (EINVAL);
return (recv_impl(snapname, props, NULL, NULL, 0, origin, force,
- resumable, raw, fd, begin_record, NULL, NULL, NULL));
+ B_FALSE, resumable, raw, fd, begin_record, NULL, NULL, NULL));
}
/*
@@ -1247,7 +1249,7 @@ lzc_receive_one(const char *snapname, nvlist_t *props,
{
(void) action_handle, (void) cleanup_fd;
return (recv_impl(snapname, props, NULL, NULL, 0, origin, force,
- resumable, raw, input_fd, begin_record,
+ B_FALSE, resumable, raw, input_fd, begin_record,
read_bytes, errflags, errors));
}
@@ -1269,7 +1271,27 @@ lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props,
{
(void) action_handle, (void) cleanup_fd;
return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin,
- force, resumable, raw, input_fd, begin_record,
+ force, B_FALSE, resumable, raw, input_fd, begin_record,
+ read_bytes, errflags, errors));
+}
+
+/*
+ * Like lzc_receive_with_cmdprops, but allows the caller to pass an additional
+ * 'heal' argument.
+ *
+ * The heal argument tells us to heal the provided snapshot using the provided
+ * send stream.
+ */
+int lzc_receive_with_heal(const char *snapname, nvlist_t *props,
+ nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin,
+ boolean_t force, boolean_t heal, boolean_t resumable, boolean_t raw,
+ int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd,
+ uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle,
+ nvlist_t **errors)
+{
+ (void) action_handle, (void) cleanup_fd;
+ return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin,
+ force, heal, resumable, raw, input_fd, begin_record,
read_bytes, errflags, errors));
}
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 0e208d279..cc55ee32b 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1870,6 +1870,17 @@ This setting will not reduce the write size below a single block.
Capped at a maximum of
.Sy 32 MiB .
.
+.It Sy zfs_recv_best_effort_corrective Ns = Ns Sy 0 Pq int
+When this variable is set to non-zero a corrective receive:
+.Bl -enum -compact -offset 4n -width "1."
+.It
+Does not enforce the restriction of source & destination snapshot GUIDs
+matching.
+.It
+If there is an error during healing, the healing receive is not
+terminated; instead, it moves on to the next record.
+.El
+.
.It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq ulong
Setting this variable overrides the default logic for estimating block
sizes when doing a
diff --git a/man/man8/zfs-receive.8 b/man/man8/zfs-receive.8
index b063b1e73..22cb567c1 100644
--- a/man/man8/zfs-receive.8
+++ b/man/man8/zfs-receive.8
@@ -29,7 +29,7 @@
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\"
-.Dd March 16, 2022
+.Dd April 26, 2022
.Dt ZFS-RECEIVE 8
.Os
.
@@ -57,6 +57,12 @@
.Fl A
.Ar filesystem Ns | Ns Ar volume
.
+.Nm
+.Cm receive
+.Fl c
+.Op Fl vn
+.Ar filesystem Ns | Ns Ar snapshot
+.
.Sh DESCRIPTION
.Bl -tag -width ""
.It Xo
@@ -393,6 +399,24 @@ restrictions (e.g. set-once) apply equally to
Abort an interrupted
.Nm zfs Cm receive Fl s ,
deleting its saved partially received state.
+.It Xo
+.Nm zfs
+.Cm receive
+.Fl c
+.Op Fl vn
+.Ar filesystem Ns | Ns Ar snapshot
+.Xc
+Attempt to correct data corruption in the specified dataset,
+by using the provided stream as the source of healthy data.
+This method of healing can only heal data blocks present in the stream.
+Metadata can not be healed by corrective receive.
+Running a scrub is recommended post-healing to ensure all corruption was
+healed.
+.Pp
+It's important to consider why corruption has happened in the first place.
+If you have slowly failing hardware, periodically healing the data
+is not going to save you from data loss later on when the hardware fails
+completely.
.El
.
.Sh EXAMPLES
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index a2c9bb556..58c88c7d7 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -520,6 +520,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
DB_RF_NOPREFETCH;
+ if ((flags & DMU_READ_NO_DECRYPT) != 0)
+ dbuf_flags |= DB_RF_NO_DECRYPT;
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index a8f511061..55d03677f 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -27,8 +27,11 @@
* Copyright (c) 2018, loli10K . All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2022 Axcient.
*/
+#include
#include
#include
#include
@@ -67,6 +70,7 @@
static int zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
static int zfs_recv_queue_ff = 20;
static int zfs_recv_write_batch_size = 1024 * 1024;
+static int zfs_recv_best_effort_corrective = 0;
static const void *const dmu_recv_tag = "dmu_recv_tag";
const char *const recv_clone_name = "%recv";
@@ -102,6 +106,8 @@ struct receive_writer_arg {
boolean_t done;
int err;
+ const char *tofs;
+ boolean_t heal;
boolean_t resumable;
boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */
boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
@@ -121,6 +127,7 @@ struct receive_writer_arg {
uint8_t or_iv[ZIO_DATA_IV_LEN];
uint8_t or_mac[ZIO_DATA_MAC_LEN];
boolean_t or_byteorder;
+ zio_t *heal_pio;
};
typedef struct dmu_recv_begin_arg {
@@ -343,9 +350,10 @@ static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
uint64_t fromguid, uint64_t featureflags)
{
- uint64_t val;
+ uint64_t obj;
uint64_t children;
int error;
+ dsl_dataset_t *snap;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
@@ -354,7 +362,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
/* Temporary clone name must not exist. */
error = zap_lookup(dp->dp_meta_objset,
dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
- 8, 1, &val);
+ 8, 1, &obj);
if (error != ENOENT)
return (error == 0 ? SET_ERROR(EBUSY) : error);
@@ -362,12 +370,16 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
if (dsl_dataset_has_resume_receive_state(ds))
return (SET_ERROR(EBUSY));
- /* New snapshot name must not exist. */
+ /* New snapshot name must not exist if we're not healing it. */
error = zap_lookup(dp->dp_meta_objset,
dsl_dataset_phys(ds)->ds_snapnames_zapobj,
- drba->drba_cookie->drc_tosnap, 8, 1, &val);
- if (error != ENOENT)
+ drba->drba_cookie->drc_tosnap, 8, 1, &obj);
+ if (drba->drba_cookie->drc_heal) {
+ if (error != 0)
+ return (error);
+ } else if (error != ENOENT) {
return (error == 0 ? SET_ERROR(EEXIST) : error);
+ }
/* Must not have children if receiving a ZVOL. */
error = zap_count(dp->dp_meta_objset,
@@ -392,8 +404,40 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
if (error != 0)
return (error);
- if (fromguid != 0) {
- dsl_dataset_t *snap;
+ if (drba->drba_cookie->drc_heal) {
+ /* Encryption is incompatible with embedded data. */
+ if (encrypted && embed)
+ return (SET_ERROR(EINVAL));
+
+ /* Healing is not supported when in 'force' mode. */
+ if (drba->drba_cookie->drc_force)
+ return (SET_ERROR(EINVAL));
+
+ /* Must have keys loaded if doing encrypted non-raw recv. */
+ if (encrypted && !raw) {
+ if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object,
+ NULL, NULL) != 0)
+ return (SET_ERROR(EACCES));
+ }
+
+ error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap);
+ if (error != 0)
+ return (error);
+
+ /*
+ * When not doing best effort corrective recv healing can only
+ * be done if the send stream is for the same snapshot as the
+ * one we are trying to heal.
+ */
+ if (zfs_recv_best_effort_corrective == 0 &&
+ drba->drba_cookie->drc_drrb->drr_toguid !=
+ dsl_dataset_phys(snap)->ds_guid) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ dsl_dataset_rele(snap, FTAG);
+ } else if (fromguid != 0) {
+ /* Sanity check the incremental recv */
uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
/* Can't perform a raw receive on top of a non-raw receive */
@@ -459,7 +503,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
dsl_dataset_rele(snap, FTAG);
} else {
- /* if full, then must be forced */
+ /* If full and not healing then must be forced. */
if (!drba->drba_cookie->drc_force)
return (SET_ERROR(EEXIST));
@@ -626,6 +670,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
char buf[ZFS_MAX_DATASET_NAME_LEN];
objset_t *os;
+ /* healing recv must be done "into" an existing snapshot */
+ if (drba->drba_cookie->drc_heal == B_TRUE)
+ return (SET_ERROR(ENOTSUP));
+
/*
* If it's a non-clone incremental, we are missing the
* target fs, so fail the recv.
@@ -807,7 +855,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
if (error == 0) {
- /* create temporary clone */
+ /* Create temporary clone unless we're doing corrective recv */
dsl_dataset_t *snap = NULL;
if (drba->drba_cookie->drc_fromsnapobj != 0) {
@@ -815,8 +863,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
ASSERT3P(dcp, ==, NULL);
}
- dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
- snap, crflags, drba->drba_cred, dcp, tx);
+ if (drc->drc_heal) {
+ /* When healing we want to use the provided snapshot */
+ VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap,
+ &dsobj));
+ } else {
+ dsobj = dsl_dataset_create_sync(ds->ds_dir,
+ recv_clone_name, snap, crflags, drba->drba_cred,
+ dcp, tx);
+ }
if (drba->drba_cookie->drc_fromsnapobj != 0)
dsl_dataset_rele(snap, FTAG);
dsl_dataset_rele_flags(ds, dsflags, FTAG);
@@ -933,7 +988,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
*/
rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
- (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
+ (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
+ !drc->drc_heal) {
(void) dmu_objset_create_impl(dp->dp_spa,
newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
}
@@ -1141,7 +1197,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
*/
int
dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
- boolean_t force, boolean_t resumable, nvlist_t *localprops,
+ boolean_t force, boolean_t heal, boolean_t resumable, nvlist_t *localprops,
nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc,
zfs_file_t *fp, offset_t *voffp)
{
@@ -1154,6 +1210,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
drc->drc_tosnap = tosnap;
drc->drc_tofs = tofs;
drc->drc_force = force;
+ drc->drc_heal = heal;
drc->drc_resumable = resumable;
drc->drc_cred = CRED();
drc->drc_proc = curproc;
@@ -1243,6 +1300,182 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
return (err);
}
+/*
+ * Holds data needed for the corrective recv callback.
+ * Allocated in do_corrective_recv() and freed in corrective_read_done().
+ */
+typedef struct cr_cb_data {
+ uint64_t size; /* set from the write record's drr_logical_size */
+ zbookmark_phys_t zb; /* bookmark of the block that was rewritten */
+ spa_t *spa; /* pool passed to spa_remove_error() */
+} cr_cb_data_t;
+
+/*
+ * Completion callback of the verification read that do_corrective_recv()
+ * issues after rewriting a block.  When the read completed without error
+ * the block's bookmark is handed to spa_remove_error().  The callback data
+ * and the read buffer are released in every case.
+ */
+static void
+corrective_read_done(zio_t *zio)
+{
+	cr_cb_data_t *cbd = zio->io_private;
+
+	/* Corruption corrected; update error log if needed */
+	if (zio->io_error == 0)
+		spa_remove_error(cbd->spa, &cbd->zb);
+
+	abd_free(zio->io_abd);
+	kmem_free(cbd, sizeof (*cbd));
+}
+
+/*
+ * zio_rewrite the data pointed to by bp with the data from the rrd's abd.
+ *
+ * Unless the stream is raw, the record payload is first converted to the
+ * on-disk form of bp: it is decompressed when the record is compressed,
+ * recompressed when bp is compressed, and re-encrypted with the salt, IV
+ * and MAC stored in bp when bp is encrypted.  The rewrite is only issued
+ * when the checksum of the resulting buffer equals the checksum in bp;
+ * afterwards the block is read back to verify the repair.
+ *
+ * Buffer ownership: each conversion frees the previous buffer and replaces
+ * it with a new one.  rrd->abd is updated at every replacement, so on any
+ * return (success or error) rrd->abd refers to exactly one live buffer
+ * which the caller must free.
+ *
+ * Returns 0 on success, otherwise an errno value.  When
+ * zfs_recv_best_effort_corrective is nonzero, a checksum mismatch and a
+ * failed rewrite or verification read are reported as success.
+ */
+static int
+do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
+    struct receive_record_arg *rrd, blkptr_t *bp)
+{
+	int err;
+	zio_t *io;
+	zbookmark_phys_t zb;
+	dnode_t *dn;
+	abd_t *abd = rrd->abd;
+	zio_cksum_t bp_cksum = bp->blk_cksum;
+	enum zio_flag flags = ZIO_FLAG_SPECULATIVE |
+	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL;
+
+	if (rwa->raw)
+		flags |= ZIO_FLAG_RAW;
+
+	/*
+	 * The dnode is held only long enough to translate the record's
+	 * offset into the block id stored in the bookmark.
+	 */
+	err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0,
+	    dbuf_whichblock(dn, 0, drrw->drr_offset));
+	dnode_rele(dn, FTAG);
+
+	if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) {
+		/* Decompress the stream data */
+		abd_t *dabd = abd_alloc_linear(
+		    drrw->drr_logical_size, B_FALSE);
+		err = zio_decompress_data(drrw->drr_compressiontype,
+		    abd, abd_to_buf(dabd), abd_get_size(abd),
+		    abd_get_size(dabd), NULL);
+
+		if (err != 0) {
+			abd_free(dabd);
+			return (err);
+		}
+		/*
+		 * Swap in the newly decompressed data.  rrd->abd is updated
+		 * immediately so that a later error return never leaves the
+		 * caller with a pointer to the buffer freed here.
+		 */
+		abd_free(abd);
+		abd = dabd;
+		rrd->abd = abd;
+	}
+
+	if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+		/* Recompress the data */
+		abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
+		    B_FALSE);
+		uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
+		    abd, abd_to_buf(cabd), abd_get_size(abd),
+		    rwa->os->os_complevel);
+		/*
+		 * Zero the tail of the buffer up to the block's physical size.
+		 * NOTE(review): assumes csize <= BP_GET_PSIZE(bp) - confirm.
+		 */
+		abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
+		/* Swap in newly compressed data (see ownership note above) */
+		abd_free(abd);
+		abd = cabd;
+		rrd->abd = abd;
+		flags |= ZIO_FLAG_RAW_COMPRESS;
+	}
+
+	/*
+	 * The stream is not encrypted but the data on-disk is.
+	 * We need to re-encrypt the buf using the same
+	 * encryption type, salt, iv, and mac that was used to encrypt
+	 * the block previously.
+	 */
+	if (!rwa->raw && BP_USES_CRYPT(bp)) {
+		dsl_dataset_t *ds;
+		dsl_crypto_key_t *dck = NULL;
+		uint8_t salt[ZIO_DATA_SALT_LEN];
+		uint8_t iv[ZIO_DATA_IV_LEN];
+		uint8_t mac[ZIO_DATA_MAC_LEN];
+		boolean_t no_crypt = B_FALSE;
+		dsl_pool_t *dp = dmu_objset_pool(rwa->os);
+		abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE);
+
+		zio_crypt_decode_params_bp(bp, salt, iv);
+		zio_crypt_decode_mac_bp(bp, mac);
+
+		dsl_pool_config_enter(dp, FTAG);
+		err = dsl_dataset_hold_flags(dp, rwa->tofs,
+		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+		if (err != 0) {
+			dsl_pool_config_exit(dp, FTAG);
+			abd_free(eabd);
+			return (SET_ERROR(EACCES));
+		}
+
+		/* Look up the key from the spa's keystore */
+		err = spa_keystore_lookup_key(rwa->os->os_spa,
+		    zb.zb_objset, FTAG, &dck);
+		if (err != 0) {
+			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
+			    FTAG);
+			dsl_pool_config_exit(dp, FTAG);
+			abd_free(eabd);
+			return (SET_ERROR(EACCES));
+		}
+
+		err = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
+		    BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv,
+		    mac, abd_get_size(abd), abd, eabd, &no_crypt);
+
+		spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG);
+		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+		dsl_pool_config_exit(dp, FTAG);
+
+		ASSERT0(no_crypt);
+		if (err != 0) {
+			abd_free(eabd);
+			return (err);
+		}
+		/* Swap in the newly encrypted data into the abd */
+		abd_free(abd);
+		abd = eabd;
+		rrd->abd = abd;
+
+		/*
+		 * We want to prevent zio_rewrite() from trying to
+		 * encrypt the data again
+		 */
+		flags |= ZIO_FLAG_RAW_ENCRYPT;
+	}
+
+	io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd,
+	    BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb);
+
+	ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
+	    abd_get_size(abd) == BP_GET_PSIZE(bp));
+
+	/* compute new bp checksum value and make sure it matches the old one */
+	zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd));
+	if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) {
+		/* The zio was never issued, so it is destroyed directly. */
+		zio_destroy(io);
+		if (zfs_recv_best_effort_corrective != 0)
+			return (0);
+		return (SET_ERROR(ECKSUM));
+	}
+
+	/* Correct the corruption in place */
+	err = zio_wait(io);
+	if (err == 0) {
+		/* cb_data and the read buffer are freed by the callback. */
+		cr_cb_data_t *cb_data =
+		    kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP);
+		cb_data->spa = rwa->os->os_spa;
+		cb_data->size = drrw->drr_logical_size;
+		cb_data->zb = zb;
+		/* Test if healing worked by re-reading the bp */
+		err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp,
+		    abd_alloc_for_io(drrw->drr_logical_size, B_FALSE),
+		    drrw->drr_logical_size, corrective_read_done,
+		    cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL));
+	}
+	if (err != 0 && zfs_recv_best_effort_corrective != 0)
+		err = 0;
+
+	return (err);
+}
+
static int
receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
{
@@ -2049,6 +2282,58 @@ receive_process_write_record(struct receive_writer_arg *rwa,
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
+ if (rwa->heal) {
+ blkptr_t *bp;
+ dmu_buf_t *dbp;
+ dnode_t *dn;
+ int flags = DB_RF_CANFAIL;
+
+ if (rwa->raw)
+ flags |= DB_RF_NO_DECRYPT;
+
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrw->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd),
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
+ }
+
+ err = dmu_buf_hold_noread(rwa->os, drrw->drr_object,
+ drrw->drr_offset, FTAG, &dbp);
+ if (err != 0)
+ return (err);
+
+ /* Try to read the object to see if it needs healing */
+ err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags);
+ /*
+ * We only try to heal when dbuf_read() returns ECKSUM.
+ * Other errors (even EIO) get returned to caller.
+ * EIO indicates that the device is not present/accessible,
+ * so writing to it will likely fail.
+ * If the block is healthy, we don't want to overwrite it
+ * unnecessarily.
+ */
+ if (err != ECKSUM) {
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+ }
+ dn = dmu_buf_dnode_enter(dbp);
+ /* Make sure the on-disk block and recv record sizes match */
+ if (drrw->drr_logical_size !=
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) {
+ err = ENOTSUP;
+ dmu_buf_dnode_exit(dbp);
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+ }
+ /* Get the block pointer for the corrupted block */
+ bp = dmu_buf_get_blkptr(dbp);
+ err = do_corrective_recv(rwa, drrw, rrd, bp);
+ dmu_buf_dnode_exit(dbp);
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+ }
+
/*
* For resuming to work, records must be in increasing order
* by (object, offset).
@@ -2341,7 +2626,8 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
rrw_exit(&ds->ds_bp_rwlock, FTAG);
dsl_dataset_name(ds, name);
dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
- (void) dsl_destroy_head(name);
+ if (!drc->drc_heal)
+ (void) dsl_destroy_head(name);
}
}
@@ -2702,7 +2988,19 @@ receive_process_record(struct receive_writer_arg *rwa,
ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
rwa->bytes_read = rrd->bytes_read;
- if (rrd->header.drr_type != DRR_WRITE) {
+ /* We can only heal write records; other ones get ignored */
+ if (rwa->heal && rrd->header.drr_type != DRR_WRITE) {
+ if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ return (0);
+ }
+
+ if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) {
err = flush_write_batch(rwa);
if (err != 0) {
if (rrd->abd != NULL) {
@@ -2737,9 +3035,16 @@ receive_process_record(struct receive_writer_arg *rwa,
case DRR_WRITE:
{
err = receive_process_write_record(rwa, rrd);
- if (err != EAGAIN) {
+ if (rwa->heal) {
/*
- * On success, receive_process_write_record() returns
+ * If healing - always free the abd after processing
+ */
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ } else if (err != EAGAIN) {
+ /*
+ * On success, a non-healing
+ * receive_process_write_record() returns
* EAGAIN to indicate that we do not want to free
* the rrd or arc_buf.
*/
@@ -2830,8 +3135,9 @@ receive_writer_thread(void *arg)
* EAGAIN indicates that this record has been saved (on
* raw->write_batch), and will be used again, so we don't
* free it.
+ * When healing data we always need to free the record.
*/
- if (err != EAGAIN) {
+ if (err != EAGAIN || rwa->heal) {
if (rwa->err == 0)
rwa->err = err;
kmem_free(rrd, sizeof (*rrd));
@@ -2839,10 +3145,13 @@ receive_writer_thread(void *arg)
}
kmem_free(rrd, sizeof (*rrd));
- int err = flush_write_batch(rwa);
- if (rwa->err == 0)
- rwa->err = err;
-
+ if (rwa->heal) {
+ zio_wait(rwa->heal_pio);
+ } else {
+ int err = flush_write_batch(rwa);
+ if (rwa->err == 0)
+ rwa->err = err;
+ }
mutex_enter(&rwa->mutex);
rwa->done = B_TRUE;
cv_signal(&rwa->cv);
@@ -2926,17 +3235,19 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
if (err != 0)
goto out;
- /*
- * If this is a new dataset we set the key immediately.
- * Otherwise we don't want to change the key until we
- * are sure the rest of the receive succeeded so we stash
- * the keynvl away until then.
- */
- err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
- drc->drc_ds->ds_object, drc->drc_fromsnapobj,
- drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
- if (err != 0)
- goto out;
+ if (!drc->drc_heal) {
+ /*
+ * If this is a new dataset we set the key immediately.
+ * Otherwise we don't want to change the key until we
+ * are sure the rest of the receive succeeded so we
+ * stash the keynvl away until then.
+ */
+ err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
+ drc->drc_ds->ds_object, drc->drc_fromsnapobj,
+ drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
+ if (err != 0)
+ goto out;
+ }
/* see comment in dmu_recv_end_sync() */
drc->drc_ivset_guid = 0;
@@ -2967,11 +3278,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
rwa->os = drc->drc_os;
rwa->byteswap = drc->drc_byteswap;
+ rwa->heal = drc->drc_heal;
+ rwa->tofs = drc->drc_tofs;
rwa->resumable = drc->drc_resumable;
rwa->raw = drc->drc_raw;
rwa->spill = drc->drc_spill;
rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0);
rwa->os->os_raw_receive = drc->drc_raw;
+ if (drc->drc_heal) {
+ rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL,
+ ZIO_FLAG_GODFATHER);
+ }
list_create(&rwa->write_batch, sizeof (struct receive_record_arg),
offsetof(struct receive_record_arg, node.bqn_node));
@@ -3107,7 +3424,9 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx)
ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
- if (!drc->drc_newfs) {
+ if (drc->drc_heal) {
+ error = 0;
+ } else if (!drc->drc_newfs) {
dsl_dataset_t *origin_head;
error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
@@ -3183,13 +3502,18 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dmu_recv_cookie_t *drc = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
- uint64_t newsnapobj;
+ uint64_t newsnapobj = 0;
spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
tx, "snap=%s", drc->drc_tosnap);
drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
- if (!drc->drc_newfs) {
+ if (drc->drc_heal) {
+ if (drc->drc_keynvl != NULL) {
+ nvlist_free(drc->drc_keynvl);
+ drc->drc_keynvl = NULL;
+ }
+ } else if (!drc->drc_newfs) {
dsl_dataset_t *origin_head;
VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
@@ -3303,7 +3627,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
* tunable is set, in which case we will leave the newly-generated
* value.
*/
- if (drc->drc_raw && drc->drc_ivset_guid != 0) {
+ if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) {
dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
DMU_OT_DSL_DATASET, tx);
VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
@@ -3370,7 +3694,7 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
if (error != 0) {
dmu_recv_cleanup_ds(drc);
nvlist_free(drc->drc_keynvl);
- } else {
+ } else if (!drc->drc_heal) {
if (drc->drc_newfs) {
zvol_create_minor(drc->drc_tofs);
}
@@ -3400,3 +3724,7 @@ ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW,
"Maximum amount of writes to batch into one transaction");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW,
+ "Ignore errors during corrective receive");
+/* END CSTYLED */
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 81e8209cd..b2b59af42 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1315,6 +1315,9 @@ spa_activate(spa_t *spa, spa_mode_t mode)
avl_create(&spa->spa_errlist_last,
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_healed,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
spa_activate_os(spa);
@@ -1425,6 +1428,7 @@ spa_deactivate(spa_t *spa)
spa_errlog_drain(spa);
avl_destroy(&spa->spa_errlist_scrub);
avl_destroy(&spa->spa_errlist_last);
+ avl_destroy(&spa->spa_errlist_healed);
spa_keystore_fini(&spa->spa_keystore);
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index 95cf90983..4572a6e56 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -22,6 +22,7 @@
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2014, Delphix. All rights reserved.
* Copyright (c) 2021, George Amanakis. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
*/
/*
@@ -68,11 +69,13 @@
#include
#include
+#define NAME_MAX_LEN 64
+
/*
* spa_upgrade_errlog_limit : A zfs module parameter that controls the number
- * of on-disk error log entries that will be converted to the new
- * format when enabling head_errlog. Defaults to 0 which converts
- * all log entries.
+ * of on-disk error log entries that will be converted to the new
+ * format when enabling head_errlog. Defaults to 0 which converts
+ * all log entries.
*/
static uint32_t spa_upgrade_errlog_limit = 0;
@@ -511,6 +514,103 @@ get_errlist_size(spa_t *spa, avl_tree_t *tree)
}
#endif
+/*
+ * Record a healed bookmark for later removal from the on-disk error log.
+ *
+ * If the error log object obj contains an entry for healed_zb, the bookmark
+ * is added to spa_errlist_healed (unless it is already there) so that the
+ * matching log entries can be removed later in sync context.  Nothing is
+ * done when obj is 0.
+ */
+static void
+spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb)
+{
+	char name[NAME_MAX_LEN];
+
+	if (obj == 0)
+		return;
+
+	bookmark_to_name(healed_zb, name, sizeof (name));
+	mutex_enter(&spa->spa_errlog_lock);
+	if (zap_contains(spa->spa_meta_objset, obj, name) == 0) {
+		/* The log has an error matching the healed bookmark. */
+		avl_tree_t *healed = &spa->spa_errlist_healed;
+		spa_error_entry_t key;
+		avl_index_t where;
+
+		key.se_bookmark = *healed_zb;
+		mutex_enter(&spa->spa_errlist_lock);
+		/* Only insert the bookmark if it is not already stashed. */
+		if (avl_find(healed, &key, &where) == NULL) {
+			spa_error_entry_t *entry =
+			    kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+			entry->se_bookmark = *healed_zb;
+			avl_insert(healed, entry, where);
+		}
+		mutex_exit(&spa->spa_errlist_lock);
+	}
+	mutex_exit(&spa->spa_errlog_lock);
+}
+
+/*
+ * Remove the entry for bookmark zb from the error tree t and free it.
+ * The tree is left untouched when it has no such entry.  The lookup and
+ * removal are done under spa_errlist_lock.
+ */
+static void
+remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb)
+{
+	spa_error_entry_t key;
+	spa_error_entry_t *entry;
+	avl_index_t where;
+
+	key.se_bookmark = *zb;
+
+	mutex_enter(&spa->spa_errlist_lock);
+	entry = avl_find(t, &key, &where);
+	if (entry != NULL) {
+		avl_remove(t, entry);
+		kmem_free(entry, sizeof (spa_error_entry_t));
+	}
+	mutex_exit(&spa->spa_errlist_lock);
+}
+
+
+/*
+ * Drain spa_errlist_healed.  For every healed bookmark, the matching entry
+ * is removed from the in-memory trees s and l and from both on-disk error
+ * logs (spa_errlog_last and spa_errlog_scrub).  The results of the on-disk
+ * removals are deliberately ignored.
+ *
+ * The caller must hold spa_errlog_lock.
+ */
+static void
+spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
+{
+	char name[NAME_MAX_LEN];
+	spa_error_entry_t *se;
+	void *cookie = NULL;
+
+	ASSERT(MUTEX_HELD(&spa->spa_errlog_lock));
+
+	for (;;) {
+		se = avl_destroy_nodes(&spa->spa_errlist_healed, &cookie);
+		if (se == NULL)
+			break;
+
+		/* Build the log key before the entry is freed. */
+		bookmark_to_name(&se->se_bookmark, name, sizeof (name));
+		remove_error_from_list(spa, s, &se->se_bookmark);
+		remove_error_from_list(spa, l, &se->se_bookmark);
+		kmem_free(se, sizeof (spa_error_entry_t));
+
+		(void) zap_remove(spa->spa_meta_objset,
+		    spa->spa_errlog_last, name, tx);
+		(void) zap_remove(spa->spa_meta_objset,
+		    spa->spa_errlog_scrub, name, tx);
+	}
+}
+
+/*
+ * Stash away healed bookmarks to remove them from the on-disk error logs
+ * later in spa_remove_healed_errors().
+ *
+ * The bookmark zb is checked against both on-disk logs (spa_errlog_last and
+ * spa_errlog_scrub); spa_add_healed_error() builds the log key itself, so
+ * no name needs to be computed here.
+ */
+void
+spa_remove_error(spa_t *spa, zbookmark_phys_t *zb)
+{
+	spa_add_healed_error(spa, spa->spa_errlog_last, zb);
+	spa_add_healed_error(spa, spa->spa_errlog_scrub, zb);
+}
+
/*
* Return the number of errors currently in the error log. This is actually the
* sum of both the last log and the current log, since we don't know the union
@@ -887,7 +987,7 @@ void
sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
{
spa_error_entry_t *se;
- char buf[64];
+ char buf[NAME_MAX_LEN];
void *cookie;
if (avl_numnodes(t) == 0)
@@ -992,6 +1092,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
*/
if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ avl_numnodes(&spa->spa_errlist_healed) == 0 &&
!spa->spa_scrub_finished) {
mutex_exit(&spa->spa_errlist_lock);
return;
@@ -1006,6 +1107,11 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ /*
+ * Remove healed errors from errors.
+ */
+ spa_remove_healed_errors(spa, &last, &scrub, tx);
+
/*
* Sync out the current list of errors.
*/
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 571e55573..382975208 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4928,7 +4928,7 @@ static boolean_t zfs_ioc_recv_inject_err;
static int
zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force,
- boolean_t resumable, int input_fd,
+ boolean_t heal, boolean_t resumable, int input_fd,
dmu_replay_record_t *begin_record, uint64_t *read_bytes,
uint64_t *errflags, nvlist_t **errors)
{
@@ -4953,7 +4953,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
return (SET_ERROR(EBADF));
noff = off = zfs_file_off(input_fp);
- error = dmu_recv_begin(tofs, tosnap, begin_record, force,
+ error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal,
resumable, localprops, hidden_args, origin, &drc, input_fp,
&off);
if (error != 0)
@@ -5296,7 +5296,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
begin_record.drr_u.drr_begin = zc->zc_begin_record;
error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops,
- NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record,
+ NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record,
&zc->zc_cookie, &zc->zc_obj, &errors);
nvlist_free(recvdprops);
nvlist_free(localprops);
@@ -5329,6 +5329,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
* "begin_record" -> non-byteswapped dmu_replay_record_t
* "input_fd" -> file descriptor to read stream from (int32)
* (optional) "force" -> force flag (value ignored)
+ * (optional) "heal" -> use send stream to heal data corruption
* (optional) "resumable" -> resumable flag (value ignored)
* (optional) "cleanup_fd" -> unused
* (optional) "action_handle" -> unused
@@ -5349,6 +5350,7 @@ static const zfs_ioc_key_t zfs_keys_recv_new[] = {
{"begin_record", DATA_TYPE_BYTE_ARRAY, 0},
{"input_fd", DATA_TYPE_INT32, 0},
{"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
{"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
{"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
{"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL},
@@ -5369,6 +5371,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
char *tosnap;
char tofs[ZFS_MAX_DATASET_NAME_LEN];
boolean_t force;
+ boolean_t heal;
boolean_t resumable;
uint64_t read_bytes = 0;
uint64_t errflags = 0;
@@ -5398,6 +5401,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
input_fd = fnvlist_lookup_int32(innvl, "input_fd");
force = nvlist_exists(innvl, "force");
+ heal = nvlist_exists(innvl, "heal");
resumable = nvlist_exists(innvl, "resumable");
/* we still use "props" here for backwards compatibility */
@@ -5414,7 +5418,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
return (error);
error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops,
- hidden_args, force, resumable, input_fd, begin_record,
+ hidden_args, force, heal, resumable, input_fd, begin_record,
&read_bytes, &errflags, &errors);
fnvlist_add_uint64(outnvl, "read_bytes", read_bytes);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 3d1ac36d9..7b55450ca 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -882,7 +882,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
return (zio);
}
-static void
+void
zio_destroy(zio_t *zio)
{
metaslab_trace_fini(&zio->io_alloc_list);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index a4ec27a36..8055c5193 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -241,7 +241,8 @@ tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted',
'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e',
'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props',
- 'zfs_receive_-wR-encrypted-mix']
+ 'zfs_receive_-wR-encrypted-mix', 'zfs_receive_corrective',
+ 'zfs_receive_compressed_corrective']
tags = ['functional', 'cli_root', 'zfs_receive']
[tests/functional/cli_root/zfs_rename]
diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c
index e84a00273..434cc863f 100644
--- a/tests/zfs-tests/cmd/libzfs_input_check.c
+++ b/tests/zfs-tests/cmd/libzfs_input_check.c
@@ -545,6 +545,7 @@ test_recv_new(const char *dataset, int fd)
fnvlist_add_string(props, "org.openzfs:launch", "September 17th, 2013");
fnvlist_add_nvlist(optional, "localprops", props);
fnvlist_add_boolean(optional, "force");
+ fnvlist_add_boolean(optional, "heal");
fnvlist_add_int32(optional, "cleanup_fd", cleanup_fd);
/*
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 4c5b11212..b13f66dc3 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -766,6 +766,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zfs_receive/zfs_receive_raw.ksh \
functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \
functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \
+ functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \
+ functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \
functional/cli_root/zfs_rename/cleanup.ksh \
functional/cli_root/zfs_rename/setup.ksh \
functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh
new file mode 100755
index 000000000..7f8eb0b13
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh
@@ -0,0 +1,193 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 Datto, Inc. All rights reserved.
+# Copyright (c) 2022 Axcient.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# OpenZFS should be able to heal data using corrective recv when the send file
+# was generated with the --compressed flag
+#
+# STRATEGY:
+# 0. Create a file, checksum the file to be corrupted then compare its checksum
+# with the one obtained after healing under different testing scenarios:
+# 1. Test healing (aka corrective) recv from a full send file
+# 2. Test healing recv (aka heal recv) from an incremental send file
+# 3. Test healing recv when compression on-disk is off but source was compressed
+# 4. Test heal recv when compression on-disk is on but source was uncompressed
+# 5. Test heal recv when compression doesn't match between send file and on-disk
+# 6. Test healing recv of an encrypted dataset using an unencrypted send file
+# 7. Test healing recv (on an encrypted dataset) using a raw send file
+# 8. Test healing when specifying destination filesystem only (no snapshot)
+# 9. Test incremental recv after healing recv
+#
+
+verify_runnable "both"
+
+# use only the first entry of $DISKS
+DISK=${DISKS%% *}
+
+# send stream files used by this test: full, raw, incremental, and a full
+# stream taken from a dataset with compression=off
+backup=$TEST_BASE_DIR/backup
+raw_backup=$TEST_BASE_DIR/raw_backup
+ibackup=$TEST_BASE_DIR/ibackup
+unc_backup=$TEST_BASE_DIR/unc_backup
+
+# Remove the send stream files, destroy $TESTPOOL if it exists, and create
+# it again on $DISK.
+function cleanup
+{
+	log_must rm -f $backup $raw_backup $ibackup $unc_backup
+
+	if poolexists $TESTPOOL; then
+		destroy_pool $TESTPOOL
+	fi
+	log_must zpool create -f $TESTPOOL $DISK
+}
+
+# Heal $1 (a dataset or snapshot name) from the send stream file $2 using
+# 'zfs recv -c'.  A scrub must report permanent errors before the receive
+# and none after it, and the md5 digest of $file must equal $checksum.
+# $file and $checksum are globals set by the test body.
+# NOTE(review): $file is assigned once and points into $TESTFS1, so the
+# digest check reads that file even when $1 names another dataset - confirm
+# this is intended.
+function test_corrective_recv
+{
+ # the corruption must be reported by a scrub before we try to heal it
+ log_must zpool scrub -w $TESTPOOL
+ log_must zpool status -v $TESTPOOL
+ log_must eval "zpool status -v $TESTPOOL | \
+ grep \"Permanent errors have been detected\""
+
+ # make sure we will read the corruption from disk by flushing the ARC
+ log_must zinject -a
+
+ log_must eval "zfs recv -c $1 < $2"
+
+ # after healing, a new scrub must no longer report permanent errors
+ log_must zpool scrub -w $TESTPOOL
+ log_must zpool status -v $TESTPOOL
+ log_mustnot eval "zpool status -v $TESTPOOL | \
+ grep \"Permanent errors have been detected\""
+ typeset cksum=$(md5digest $file)
+ [[ "$cksum" == "$checksum" ]] || \
+ log_fail "Checksums differ ($cksum != $checksum)"
+}
+
+log_onexit cleanup
+
+log_assert "ZFS corrective receive should be able to heal data corruption"
+
+typeset passphrase="password"
+typeset file="/$TESTPOOL/$TESTFS1/$TESTFILE0"
+
+# recreate the pool with the head_errlog feature disabled
+log_must eval "poolexists $TESTPOOL && destroy_pool $TESTPOOL"
+log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL $DISK
+
+# passphrase file used as keylocation for the encrypted dataset created below
+log_must eval "echo $passphrase > /$TESTPOOL/pwd"
+
+log_must zfs create -o primarycache=none \
+ -o atime=off -o compression=lz4 $TESTPOOL/$TESTFS1
+
+# create the file to be corrupted and remember its checksum
+log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync
+log_must eval "echo 'aaaaaaaa' >> "$file
+typeset checksum=$(md5digest $file)
+
+log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1
+
+# create full send file
+log_must eval "zfs send --compressed $TESTPOOL/$TESTFS1@snap1 > $backup"
+
+# add a second file so that the incremental stream has data to heal
+log_must dd if=/dev/urandom of=$file"1" bs=1024 count=1024 oflag=sync
+log_must eval "echo 'bbbbbbbb' >> "$file"1"
+log_must zfs snapshot $TESTPOOL/$TESTFS1@snap2
+# create incremental send file
+log_must eval "zfs send -c -i $TESTPOOL/$TESTFS1@snap1 \
+ $TESTPOOL/$TESTFS1@snap2 > $ibackup"
+
+corrupt_blocks_at_level $file 0
+# test healing recv from a full send file
+test_corrective_recv $TESTPOOL/$TESTFS1@snap1 $backup
+
+corrupt_blocks_at_level $file"1" 0
+# test healing recv from an incremental send file
+test_corrective_recv $TESTPOOL/$TESTFS1@snap2 $ibackup
+
+# create new uncompressed dataset using our send file
+log_must eval "zfs recv -o compression=off -o primarycache=none \
+ $TESTPOOL/$TESTFS2 < $backup"
+typeset compr=$(get_prop compression $TESTPOOL/$TESTFS2)
+[[ "$compr" == "off" ]] || \
+ log_fail "Unexpected compression $compr in recved dataset"
+corrupt_blocks_at_level "/$TESTPOOL/$TESTFS2/$TESTFILE0" 0
+# test healing recv when compression on-disk is off but source was compressed
+test_corrective_recv "$TESTPOOL/$TESTFS2@snap1" $backup
+
+# create a full sendfile from an uncompressed source
+log_must eval "zfs send --compressed $TESTPOOL/$TESTFS2@snap1 > $unc_backup"
+log_must eval "zfs recv -o compression=gzip -o primarycache=none \
+ $TESTPOOL/testfs3 < $unc_backup"
+typeset compr=$(get_prop compression $TESTPOOL/testfs3)
+[[ "$compr" == "gzip" ]] || \
+ log_fail "Unexpected compression $compr in recved dataset"
+corrupt_blocks_at_level "/$TESTPOOL/testfs3/$TESTFILE0" 0
+# test healing recv when compression on-disk is on but source was uncompressed
+test_corrective_recv "$TESTPOOL/testfs3@snap1" $unc_backup
+
+# create new compressed dataset using our send file
+log_must eval "zfs recv -o compression=gzip -o primarycache=none \
+ $TESTPOOL/testfs4 < $backup"
+typeset compr=$(get_prop compression $TESTPOOL/testfs4)
+[[ "$compr" == "gzip" ]] || \
+ log_fail "Unexpected compression $compr in recved dataset"
+corrupt_blocks_at_level "/$TESTPOOL/testfs4/$TESTFILE0" 0
+# test healing recv when compression doesn't match between send file and on-disk
+test_corrective_recv "$TESTPOOL/testfs4@snap1" $backup
+
+# create new encrypted (and compressed) dataset using our send file
+log_must eval "zfs recv -o encryption=aes-256-ccm -o keyformat=passphrase \
+ -o keylocation=file:///$TESTPOOL/pwd -o primarycache=none \
+ $TESTPOOL/testfs5 < $backup"
+typeset encr=$(get_prop encryption $TESTPOOL/testfs5)
+[[ "$encr" == "aes-256-ccm" ]] || \
+ log_fail "Unexpected encryption $encr in recved dataset"
+# note: from here on $backup holds a stream of testfs5@snap1, not of
+# $TESTFS1@snap1
+log_must eval "zfs send --raw $TESTPOOL/testfs5@snap1 > $raw_backup"
+log_must eval "zfs send --compressed $TESTPOOL/testfs5@snap1 > $backup"
+corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0
+# test healing recv of an encrypted dataset using an unencrypted send file
+test_corrective_recv "$TESTPOOL/testfs5@snap1" $backup
+corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0
+log_must zfs unmount $TESTPOOL/testfs5
+log_must zfs unload-key $TESTPOOL/testfs5
+# test healing recv (on an encrypted dataset) using a raw send file
+test_corrective_recv "$TESTPOOL/testfs5@snap1" $raw_backup
+# non raw send file healing an encrypted dataset with an unloaded key will fail
+log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $backup"
+
+log_must zfs rollback -r $TESTPOOL/$TESTFS1@snap1
+corrupt_blocks_at_level $file 0
+# test healing when specifying destination filesystem only (no snapshot)
+test_corrective_recv $TESTPOOL/$TESTFS1 $backup
+# test incremental recv after healing recv
+log_must eval "zfs recv $TESTPOOL/$TESTFS1 < $ibackup"
+
+# test that healing recv can not be combined with incompatible recv options
+log_mustnot eval "zfs recv -h -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -F -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -s -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -u -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -d -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -e -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+
+# ensure healing recv doesn't work when snap GUIDS don't match
+log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap2 < $backup"
+log_mustnot eval "zfs recv -c $TESTPOOL/testfs5 < $backup"
+
+# test that healing recv doesn't work on non-existing snapshots
+log_mustnot eval "zfs recv -c $TESTPOOL/$TESTFS1@missing < $backup"
+
+log_pass "OpenZFS corrective recv works for data healing"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh
new file mode 100755
index 000000000..b2bbdf2a7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh
@@ -0,0 +1,192 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 Datto, Inc. All rights reserved.
+# Copyright (c) 2022 Axcient.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# OpenZFS should be able to heal data using corrective recv
+#
+# STRATEGY:
+# 0. Create a file, checksum the file to be corrupted then compare its checksum
+# with the one obtained after healing under different testing scenarios:
+# 1. Test healing (aka corrective) recv from a full send file
+# 2. Test healing recv (aka heal recv) from an incremental send file
+# 3. Test healing recv when compression on-disk is off but source was compressed
+# 4. Test heal recv when compression on-disk is on but source was uncompressed
+# 5. Test heal recv when compression doesn't match between send file and on-disk
+# 6. Test healing recv of an encrypted dataset using an unencrypted send file
+# 7. Test healing recv (on an encrypted dataset) using a raw send file
+# 8. Test healing when specifying destination filesystem only (no snapshot)
+# 9. Test incremental recv after healing recv
+#
+
+verify_runnable "both"
+
+DISK=${DISKS%% *} # keep only the first space-separated entry of $DISKS
+# send stream files used by the scenarios below; cleanup removes all four
+backup=$TEST_BASE_DIR/backup # full (non-raw) send of @snap1
+raw_backup=$TEST_BASE_DIR/raw_backup # raw send of the encrypted dataset
+ibackup=$TEST_BASE_DIR/ibackup # incremental send snap1 -> snap2
+unc_backup=$TEST_BASE_DIR/unc_backup # full send of the uncompressed dataset
+
+function cleanup # log_onexit handler: remove send files, reset the pool
+{
+ log_must rm -f $backup $raw_backup $ibackup $unc_backup
+ # leave a freshly created $TESTPOOL behind, whether or not one exists now
+ poolexists $TESTPOOL && destroy_pool $TESTPOOL
+ log_must zpool create -f $TESTPOOL $DISK
+}
+
+function test_corrective_recv # <recv target> <send file>
+{
+ log_must zpool scrub -w $TESTPOOL # precondition: damage must be reported
+ log_must zpool status -v $TESTPOOL
+ log_must eval "zpool status -v $TESTPOOL | \
+ grep \"Permanent errors have been detected\""
+
+ # make sure we will read the corruption from disk by flushing the ARC
+ log_must zinject -a
+ # heal $1 (dataset or snapshot) from the send file $2
+ log_must eval "zfs recv -c $1 < $2"
+ # after healing, a new scrub must not report permanent errors any more
+ log_must zpool scrub -w $TESTPOOL
+ log_must zpool status -v $TESTPOOL
+ log_mustnot eval "zpool status -v $TESTPOOL | \
+ grep \"Permanent errors have been detected\""
+ typeset cksum=$(md5digest $file) # NOTE: global $file, whatever $1 is
+ [[ "$cksum" == "$checksum" ]] || \
+ log_fail "Checksums differ ($cksum != $checksum)"
+}
+
+log_onexit cleanup
+
+log_assert "ZFS corrective receive should be able to heal data corruption"
+
+typeset passphrase="password"
+typeset file="/$TESTPOOL/$TESTFS1/$TESTFILE0"
+# start from a fresh pool; an already-absent pool must not fail the test
+poolexists $TESTPOOL && destroy_pool $TESTPOOL
+log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL $DISK
+# passphrase file, used below as keylocation for the encrypted dataset
+log_must eval "echo $passphrase > /$TESTPOOL/pwd"
+
+log_must zfs create -o primarycache=none \
+ -o atime=off -o compression=lz4 $TESTPOOL/$TESTFS1
+# 1 MiB of random data plus a marker line; its md5 is the healing reference
+log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync
+log_must eval "echo 'aaaaaaaa' >> "$file
+typeset checksum=$(md5digest $file)
+
+log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1
+
+# create full send file
+log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 > $backup"
+# second file, written after snap1, so the incremental stream carries data
+log_must dd if=/dev/urandom of=$file"1" bs=1024 count=1024 oflag=sync
+log_must eval "echo 'bbbbbbbb' >> "$file"1"
+log_must zfs snapshot $TESTPOOL/$TESTFS1@snap2
+# create incremental send file
+log_must eval "zfs send -i $TESTPOOL/$TESTFS1@snap1 \
+ $TESTPOOL/$TESTFS1@snap2 > $ibackup"
+
+corrupt_blocks_at_level $file 0
+# scenario 1: test healing recv of snap1 from a full send file
+test_corrective_recv $TESTPOOL/$TESTFS1@snap1 $backup
+
+corrupt_blocks_at_level $file"1" 0
+# scenario 2: test healing recv of snap2 from an incremental send file
+test_corrective_recv $TESTPOOL/$TESTFS1@snap2 $ibackup
+
+# scenario 3 setup: receive the lz4 source into a compression=off dataset
+log_must eval "zfs recv -o compression=off -o primarycache=none \
+ $TESTPOOL/$TESTFS2 < $backup"
+typeset compr=$(get_prop compression $TESTPOOL/$TESTFS2)
+[[ "$compr" == "off" ]] || \
+ log_fail "Unexpected compression $compr in recved dataset"
+corrupt_blocks_at_level "/$TESTPOOL/$TESTFS2/$TESTFILE0" 0
+# scenario 3: heal when on-disk compression is off but source was compressed
+test_corrective_recv "$TESTPOOL/$TESTFS2@snap1" $backup
+
+# scenario 4 setup: send the uncompressed dataset, receive it with gzip
+log_must eval "zfs send $TESTPOOL/$TESTFS2@snap1 > $unc_backup"
+log_must eval "zfs recv -o compression=gzip -o primarycache=none \
+ $TESTPOOL/testfs3 < $unc_backup"
+typeset compr=$(get_prop compression $TESTPOOL/testfs3)
+[[ "$compr" == "gzip" ]] || \
+ log_fail "Unexpected compression $compr in recved dataset"
+corrupt_blocks_at_level "/$TESTPOOL/testfs3/$TESTFILE0" 0
+# scenario 4: heal when on-disk compression is on but source was uncompressed
+test_corrective_recv "$TESTPOOL/testfs3@snap1" $unc_backup
+
+# scenario 5 setup: receive the lz4 source's send file into a gzip dataset
+log_must eval "zfs recv -o compression=gzip -o primarycache=none \
+ $TESTPOOL/testfs4 < $backup"
+typeset compr=$(get_prop compression $TESTPOOL/testfs4)
+[[ "$compr" == "gzip" ]] || \
+ log_fail "Unexpected compression $compr in recved dataset"
+corrupt_blocks_at_level "/$TESTPOOL/testfs4/$TESTFILE0" 0
+# scenario 5: heal when send file and on-disk compression don't match
+test_corrective_recv "$TESTPOOL/testfs4@snap1" $backup
+
+# create new encrypted (and compressed) dataset using our send file
+log_must eval "zfs recv -o encryption=aes-256-ccm -o keyformat=passphrase \
+ -o keylocation=file:///$TESTPOOL/pwd -o primarycache=none \
+ $TESTPOOL/testfs5 < $backup"
+typeset encr=$(get_prop encryption $TESTPOOL/testfs5)
+[[ "$encr" == "aes-256-ccm" ]] || \
+ log_fail "Unexpected encryption $encr in recved dataset"
+log_must eval "zfs send --raw $TESTPOOL/testfs5@snap1 > $raw_backup"
+log_must eval "zfs send $TESTPOOL/testfs5@snap1 > $backup" # replaces $backup
+corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0
+# scenario 6: heal an encrypted dataset using an unencrypted send file
+test_corrective_recv "$TESTPOOL/testfs5@snap1" $backup
+corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0
+log_must zfs unmount $TESTPOOL/testfs5
+log_must zfs unload-key $TESTPOOL/testfs5
+# scenario 7: heal an encrypted dataset (key unloaded) using a raw send file
+test_corrective_recv "$TESTPOOL/testfs5@snap1" $raw_backup
+# a non-raw send file cannot heal an encrypted dataset whose key is unloaded
+log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $backup"
+
+log_must zfs rollback -r $TESTPOOL/$TESTFS1@snap1 # -r also destroys snap2
+corrupt_blocks_at_level $file 0
+# scenario 8: heal when specifying destination filesystem only (no snapshot)
+test_corrective_recv $TESTPOOL/$TESTFS1 $backup
+# scenario 9: test incremental recv after healing recv
+log_must eval "zfs recv $TESTPOOL/$TESTFS1 < $ibackup"
+
+# test that healing recv cannot be combined with incompatible recv options
+log_mustnot eval "zfs recv -h -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -F -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -s -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -u -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -d -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+log_mustnot eval "zfs recv -e -c $TESTPOOL/$TESTFS1@snap1 < $backup"
+# ensure healing recv doesn't work when snap GUIDs don't match
+# NOTE(review): testfs5's key is still unloaded here -- confirm failure cause
+log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap2 < $backup"
+log_mustnot eval "zfs recv -c $TESTPOOL/testfs5 < $backup"
+
+# test that healing recv doesn't work on non-existing snapshots
+log_mustnot eval "zfs recv -c $TESTPOOL/$TESTFS1@missing < $backup"
+
+log_pass "OpenZFS corrective recv works for data healing"