diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 88aa7c91f..f1d686753 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4746,7 +4746,7 @@ zfs_do_receive(int argc, char **argv) nomem(); /* check options */ - while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) { + while ((c = getopt(argc, argv, ":o:x:dehMnuvFsAc")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { @@ -4802,6 +4802,9 @@ zfs_do_receive(int argc, char **argv) case 'A': abort_resumable = B_TRUE; break; + case 'c': + flags.heal = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); diff --git a/contrib/pyzfs/libzfs_core/__init__.py b/contrib/pyzfs/libzfs_core/__init__.py index 25ea3e495..a80f94b52 100644 --- a/contrib/pyzfs/libzfs_core/__init__.py +++ b/contrib/pyzfs/libzfs_core/__init__.py @@ -72,6 +72,7 @@ from ._libzfs_core import ( lzc_receive_resumable, lzc_receive_with_cmdprops, lzc_receive_with_header, + lzc_receive_with_heal, lzc_release, lzc_reopen, lzc_rollback, @@ -127,6 +128,7 @@ __all__ = [ 'lzc_receive_resumable', 'lzc_receive_with_cmdprops', 'lzc_receive_with_header', + 'lzc_receive_with_heal', 'lzc_release', 'lzc_reopen', 'lzc_rollback', diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py index f494461f6..26676db39 100644 --- a/contrib/pyzfs/libzfs_core/_error_translation.py +++ b/contrib/pyzfs/libzfs_core/_error_translation.py @@ -469,6 +469,8 @@ def lzc_receive_translate_errors( raise lzc_exc.ReadOnlyPool(_pool_name(snapname)) if ret == errno.EAGAIN: raise lzc_exc.SuspendedPool(_pool_name(snapname)) + if ret == errno.EACCES: + raise lzc_exc.EncryptionKeyNotLoaded() if ret == ECKSUM: raise lzc_exc.BadStream() if ret == ZFS_ERR_WRONG_PARENT: diff --git a/contrib/pyzfs/libzfs_core/_libzfs_core.py b/contrib/pyzfs/libzfs_core/_libzfs_core.py index fcfa5be31..fa74ad9a7 100644 --- a/contrib/pyzfs/libzfs_core/_libzfs_core.py +++ 
b/contrib/pyzfs/libzfs_core/_libzfs_core.py @@ -1426,6 +1426,135 @@ def lzc_receive_with_cmdprops( return (int(c_read_bytes[0]), action_handle) +@_uncommitted() +def lzc_receive_with_heal( + snapname, fd, begin_record, force=False, corrective=True, resumable=False, + raw=False, origin=None, props=None, cmdprops=None, key=None, cleanup_fd=-1, + action_handle=0 +): + ''' + Like :func:`lzc_receive_with_cmdprops`, but allows the caller to pass an + additional 'corrective' argument. The 'corrective' boolean set to true + indicates that a corruption healing receive should be performed. + + :param bytes snapname: the name of the snapshot to create. + :param int fd: the file descriptor from which to read the stream. + :param begin_record: the stream's begin record. + :type begin_record: ``cffi`` `CData` representing the dmu_replay_record_t + structure. + :param bool force: whether to roll back or destroy the target filesystem + if that is required to receive the stream. + :param bool corrective: whether this stream should be used to heal data. + :param bool resumable: whether this stream should be treated as resumable. + If the receive fails due to premature stream termination, the + intermediate state will be preserved on disk and may subsequently be + resumed with :func:`lzc_send_resume`. + :param bool raw: whether this is a "raw" stream. + :param origin: the optional origin snapshot name if the stream is for a + clone. + :type origin: bytes or None + :param props: the properties to set on the snapshot as *received* + properties. + :type props: dict of bytes : Any + :param cmdprops: the properties to set on the snapshot as local overrides + to *received* properties. `bool` values are forcefully inherited while + every other value is set locally as if the command "zfs set" was + invoked immediately before the receive. 
+ :type cmdprops: dict of bytes : Any + :param key: raw bytes representing user's wrapping key + :type key: bytes + :param int cleanup_fd: file descriptor used to set a cleanup-on-exit file + descriptor. + :param int action_handle: variable used to pass the handle for guid/ds + mapping: this should be set to zero on first call and will contain an + updated handle on success, it should be passed in subsequent calls. + + :return: a tuple with two elements where the first one is the number of + bytes read from the file descriptor and the second one is the + action_handle return value. + + :raises IOError: if an input / output error occurs while reading from the + ``fd``. + :raises DatasetExists: if the snapshot named ``snapname`` already exists. + :raises DatasetExists: if the stream is a full stream and the destination + filesystem already exists. + :raises DatasetExists: if ``force`` is `True` but the destination + filesystem could not be rolled back to a matching snapshot because a + newer snapshot exists and it is an origin of a cloned filesystem. + :raises StreamMismatch: if an incremental stream is received and the latest + snapshot of the destination filesystem does not match the source + snapshot of the stream. + :raises StreamMismatch: if a full stream is received and the destination + filesystem already exists and it has at least one snapshot, and + ``force`` is `False`. + :raises StreamMismatch: if an incremental clone stream is received but the + specified ``origin`` is not the actual received origin. + :raises DestinationModified: if an incremental stream is received and the + destination filesystem has been modified since the last snapshot and + ``force`` is `False`. + :raises DestinationModified: if a full stream is received and the + destination filesystem already exists and it does not have any + snapshots, and ``force`` is `False`. + :raises DatasetNotFound: if the destination filesystem and its parent do + not exist. 
+ :raises DatasetNotFound: if the ``origin`` is not `None` and does not + exist. + :raises DatasetBusy: if ``force`` is `True` but the destination filesystem + could not be rolled back to a matching snapshot because a newer + snapshot is held and could not be destroyed. + :raises DatasetBusy: if another receive operation is being performed on the + destination filesystem. + :raises EncryptionKeyNotLoaded: if corrective is set to true indicates the + key must be loaded to do a non-raw corrective recv on an encrypted + dataset. + :raises BadStream: if corrective is set to true indicates that + corrective recv was not able to reconstruct a corrupted block. + :raises BadStream: if the stream is corrupt or it is not recognized or it + is a compound stream or it is a clone stream, but ``origin`` is `None`. + :raises BadStream: if a clone stream is received and the destination + filesystem already exists. + :raises StreamFeatureNotSupported: if corrective is set to true indicates + stream is not compatible with the data in the pool. + :raises StreamFeatureNotSupported: if the stream has a feature that is not + supported on this side. + :raises ReceivePropertyFailure: if one or more of the specified properties + is invalid or has an invalid type or value. + :raises NameInvalid: if the name of either snapshot is invalid. + :raises NameTooLong: if the name of either snapshot is too long. 
+ ''' + + if origin is not None: + c_origin = origin + else: + c_origin = _ffi.NULL + if action_handle is not None: + c_action_handle = _ffi.new("uint64_t *") + else: + c_action_handle = _ffi.NULL + c_read_bytes = _ffi.new("uint64_t *") + c_errflags = _ffi.new("uint64_t *") + if props is None: + props = {} + if cmdprops is None: + cmdprops = {} + if key is None: + key = b"" + else: + key = bytes(key) + + nvlist = nvlist_in(props) + cmdnvlist = nvlist_in(cmdprops) + properrs = {} + with nvlist_out(properrs) as c_errors: + ret = _lib.lzc_receive_with_heal( + snapname, nvlist, cmdnvlist, key, len(key), c_origin, + force, corrective, resumable, raw, fd, begin_record, cleanup_fd, + c_read_bytes, c_errflags, c_action_handle, c_errors) + errors.lzc_receive_translate_errors( + ret, snapname, fd, force, raw, False, False, origin, properrs) + return (int(c_read_bytes[0]), action_handle) + + @_uncommitted() def lzc_reopen(poolname, restart=True): ''' diff --git a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py index 1b46a0891..bcb9ed379 100644 --- a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py +++ b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py @@ -112,6 +112,10 @@ CDEF = """ uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, const dmu_replay_record_t *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); + int lzc_receive_with_heal(const char *, nvlist_t *, nvlist_t *, + uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, + boolean_t, int, const dmu_replay_record_t *, int, uint64_t *, + uint64_t *, uint64_t *, nvlist_t **); int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t, boolean_t, boolean_t, int, const dmu_replay_record_t *); int lzc_release(nvlist_t *, nvlist_t **); diff --git a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py index 9b1aea193..c94ae6de6 100644 --- 
a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py +++ b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py @@ -2911,6 +2911,27 @@ class ZFSTest(unittest.TestCase): self.assertEqual(fs.getProperty("compression"), b"on") self.assertEqual(fs.getProperty("ns:prop"), b"val") + def test_recv_with_heal(self): + snap = ZFSTest.pool.makeName(b"fs1@snap1") + fs = ZFSTest.pool.getFilesystem(b"fs1") + props = {} + cmdprops = { + b"compression": 0x01, + b"ns:prop": b"val" + } + + lzc.lzc_snapshot([snap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(snap, None, stream.fileno()) + stream.seek(0) + (header, c_header) = lzc.receive_header(stream.fileno()) + lzc.lzc_receive_with_heal( + snap, stream.fileno(), c_header, props=props, + cmdprops=cmdprops) + self.assertExists(snap) + self.assertEqual(fs.getProperty("compression"), b"on") + self.assertEqual(fs.getProperty("ns:prop"), b"val") + def test_recv_with_cmdprops_and_recvprops(self): fromsnap = ZFSTest.pool.makeName(b"fs1@snap1") fs = ZFSTest.pool.getFilesystem(b"recv") diff --git a/include/libzfs.h b/include/libzfs.h index 52e59ac65..4948cd0d3 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -829,6 +829,9 @@ typedef struct recvflags { /* force unmount while recv snapshot (private) */ boolean_t forceunmount; + + /* use this recv to check (and heal if needed) an existing snapshot */ + boolean_t heal; } recvflags_t; _LIBZFS_H int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 926d11eb5..14a4857c3 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -21,9 +21,9 @@ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. - * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019 Datto Inc. 
*/ #ifndef _LIBZFS_CORE_H @@ -114,6 +114,10 @@ _LIBZFS_CORE_H int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_receive_with_heal(const char *, nvlist_t *, nvlist_t *, + uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, boolean_t, + int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, + uint64_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); _LIBZFS_CORE_H int lzc_send_space_resume_redacted(const char *, const char *, diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index 41a65e827..538c73610 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2019 Datto Inc. */ #ifndef _DMU_RECV_H @@ -47,6 +48,7 @@ typedef struct dmu_recv_cookie { boolean_t drc_byteswap; uint64_t drc_featureflags; boolean_t drc_force; + boolean_t drc_heal; boolean_t drc_resumable; boolean_t drc_should_save; boolean_t drc_raw; @@ -78,7 +80,7 @@ typedef struct dmu_recv_cookie { } dmu_recv_cookie_t; int dmu_recv_begin(char *, char *, dmu_replay_record_t *, - boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *, + boolean_t, boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *, dmu_recv_cookie_t *, zfs_file_t *, offset_t *); int dmu_recv_stream(dmu_recv_cookie_t *, offset_t *); int dmu_recv_end(dmu_recv_cookie_t *, void *); diff --git a/include/sys/spa.h b/include/sys/spa.h index b53439a82..e185ce6b1 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -26,10 +26,10 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Datto Inc. */ #ifndef _SYS_SPA_H @@ -1134,6 +1134,7 @@ extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); +extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 3fa9c80d1..469b1266e 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -25,8 +25,8 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. - * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019 Datto Inc. */ #ifndef _SYS_SPA_IMPL_H @@ -349,6 +349,7 @@ struct spa { kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ + avl_tree_t spa_errlist_healed; /* list of healed blocks */ uint64_t spa_deflate; /* should we deflate? 
*/ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ diff --git a/include/sys/zio.h b/include/sys/zio.h index b6f8da760..23fdda457 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -534,6 +534,8 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *priv, enum zio_flag flags); +extern void zio_destroy(zio_t *zio); + extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index d0c90899a..640051e3b 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -436,6 +436,29 @@ send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) } } +/* + * returns snapshot guid + * and returns 0 if the snapshot does not exist + */ +static uint64_t +get_snap_guid(libzfs_handle_t *hdl, const char *fs, const char *snap) +{ + char name[MAXPATHLEN + 1]; + uint64_t guid = 0; + + if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') + return (guid); + + (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); + zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + guid = zfs_prop_get_int(zhp, ZFS_PROP_GUID); + zfs_close(zhp); + } + + return (guid); +} + /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist @@ -4541,9 +4564,34 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; - if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { + if (flags->heal) { + if (flags->isprefix || flags->istail || flags->force || + flags->canmountoff || flags->resumable || flags->nomount || + flags->skipholds) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + 
"corrective recv can not be used when combined with" + " this flag")); + err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto out; + } + uint64_t guid = + get_snap_guid(hdl, name, strchr(destsnap, '@') + 1); + if (guid == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "corrective recv must specify an existing snapshot" + " to heal")); + err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto out; + } else if (guid != drrb->drr_toguid) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "local snapshot doesn't match the snapshot" + " in the provided stream")); + err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); + goto out; + } + } else if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zhp; + zfs_handle_t *zhp = NULL; boolean_t encrypted; (void) strcpy(zc.zc_name, name); @@ -4737,8 +4785,9 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } if (flags->verbose) { - (void) printf("%s %s stream of %s into %s\n", + (void) printf("%s %s%s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", + flags->heal ? " corrective" : "", drrb->drr_fromguid ? 
"incremental" : "full", drrb->drr_toname, destsnap); (void) fflush(stdout); @@ -4808,10 +4857,17 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, goto out; } - err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, - oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, - raw, infd, drr_noswap, -1, &read_bytes, &errflags, - NULL, &prop_errors); + if (flags->heal) { + err = ioctl_err = lzc_receive_with_heal(destsnap, rcvprops, + oxprops, wkeydata, wkeylen, origin, flags->force, + flags->heal, flags->resumable, raw, infd, drr_noswap, -1, + &read_bytes, &errflags, NULL, &prop_errors); + } else { + err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, + oxprops, wkeydata, wkeylen, origin, flags->force, + flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, + &errflags, NULL, &prop_errors); + } ioctl_errno = ioctl_err; prop_errflags = errflags; @@ -4933,7 +4989,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EACCES: - if (raw && stream_wantsnewfs) { + if (flags->heal) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "key must be loaded to do a non-raw " + "corrective recv on an encrypted " + "dataset.")); + } else if (raw && stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create encryption key")); } else if (raw && !stream_wantsnewfs) { @@ -4973,8 +5034,14 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, break; case ECKSUM: case ZFS_ERR_STREAM_TRUNCATED: - recv_ecksum_set_aux(hdl, destsnap, flags->resumable, - ioctl_err == ECKSUM); + if (flags->heal) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "corrective receive was not able to " + "reconstruct the data needed for " + "healing.")); + else + recv_ecksum_set_aux(hdl, destsnap, + flags->resumable, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: @@ -4984,8 +5051,14 
@@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded to receive this stream.")); + if (flags->heal) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream is not compatible with the " + "data in the pool.")); + else + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to receive this " + "stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index fae98469a..7e340e1d4 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -181,6 +181,7 @@ + @@ -1741,6 +1742,26 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index d29133ab3..16bd9af1b 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -22,10 +22,10 @@ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. + * Copyright (c) 2019 Datto Inc. */ /* @@ -986,7 +986,7 @@ recv_read(int fd, void *buf, int ilen) static int recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, - boolean_t resumable, boolean_t raw, int input_fd, + boolean_t heal, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { @@ -1041,7 +1041,7 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, /* * All receives with a payload should use the new interface. 
*/ - if (resumable || raw || wkeydata != NULL || payload) { + if (resumable || heal || raw || wkeydata != NULL || payload) { nvlist_t *outnvl = NULL; nvlist_t *innvl = fnvlist_alloc(); @@ -1081,6 +1081,8 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, if (resumable) fnvlist_add_boolean(innvl, "resumable"); + if (heal) + fnvlist_add_boolean(innvl, "heal"); error = lzc_ioctl(ZFS_IOC_RECV_NEW, fsname, innvl, &outnvl); @@ -1180,7 +1182,7 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - B_FALSE, raw, fd, NULL, NULL, NULL, NULL)); + B_FALSE, B_FALSE, raw, fd, NULL, NULL, NULL, NULL)); } /* @@ -1194,7 +1196,7 @@ lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - B_TRUE, raw, fd, NULL, NULL, NULL, NULL)); + B_FALSE, B_TRUE, raw, fd, NULL, NULL, NULL, NULL)); } /* @@ -1217,7 +1219,7 @@ lzc_receive_with_header(const char *snapname, nvlist_t *props, return (EINVAL); return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - resumable, raw, fd, begin_record, NULL, NULL, NULL)); + B_FALSE, resumable, raw, fd, begin_record, NULL, NULL, NULL)); } /* @@ -1247,7 +1249,7 @@ lzc_receive_one(const char *snapname, nvlist_t *props, { (void) action_handle, (void) cleanup_fd; return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - resumable, raw, input_fd, begin_record, + B_FALSE, resumable, raw, input_fd, begin_record, read_bytes, errflags, errors)); } @@ -1269,7 +1271,27 @@ lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, { (void) action_handle, (void) cleanup_fd; return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, - force, resumable, raw, input_fd, begin_record, + force, B_FALSE, resumable, raw, input_fd, begin_record, + read_bytes, 
errflags, errors)); +} + +/* + * Like lzc_receive_with_cmdprops, but allows the caller to pass an additional + * 'heal' argument. + * + * The heal argument tells us to heal the provided snapshot using the provided + * send stream + */ +int lzc_receive_with_heal(const char *snapname, nvlist_t *props, + nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, + boolean_t force, boolean_t heal, boolean_t resumable, boolean_t raw, + int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, + uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, + nvlist_t **errors) +{ + (void) action_handle, (void) cleanup_fd; + return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, + force, heal, resumable, raw, input_fd, begin_record, read_bytes, errflags, errors)); } diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 0e208d279..cc55ee32b 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1870,6 +1870,17 @@ This setting will not reduce the write size below a single block. Capped at a maximum of .Sy 32 MiB . . +.It Sy zfs_recv_best_effort_corrective Ns = Ns Sy 0 Pq int +When this variable is set to non-zero a corrective receive: +.Bl -enum -compact -offset 4n -width "1." +.It +Does not enforce the restriction of source & destination snapshot GUIDs +matching. +.It +If there is an error during healing, the healing receive is not +terminated; instead it moves on to the next record. +.El +. .It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq ulong Setting this variable overrides the default logic for estimating block sizes when doing a diff --git a/man/man8/zfs-receive.8 b/man/man8/zfs-receive.8 index b063b1e73..22cb567c1 100644 --- a/man/man8/zfs-receive.8 +++ b/man/man8/zfs-receive.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd April 26, 2022 .Dt ZFS-RECEIVE 8 .Os . @@ -57,6 +57,12 @@ .Fl A .Ar filesystem Ns | Ns Ar volume . 
+.Nm +.Cm receive +.Fl c +.Op Fl vn +.Ar filesystem Ns | Ns Ar snapshot +. .Sh DESCRIPTION .Bl -tag -width "" .It Xo @@ -393,6 +399,24 @@ restrictions (e.g. set-once) apply equally to Abort an interrupted .Nm zfs Cm receive Fl s , deleting its saved partially received state. +.It Xo +.Nm zfs +.Cm receive +.Fl c +.Op Fl vn +.Ar filesystem Ns | Ns Ar snapshot +.Xc +Attempt to correct data corruption in the specified dataset, +by using the provided stream as the source of healthy data. +This method of healing can only heal data blocks present in the stream. +Metadata can not be healed by corrective receive. +Running a scrub is recommended post-healing to ensure all corruption was +healed. +.Pp +It's important to consider why corruption has happened in the first place +since if you have slowly failing hardware periodically healing the data +is not going to save you from data loss later on when the hardware fails +completely. .El . .Sh EXAMPLES diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index a2c9bb556..58c88c7d7 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -520,6 +520,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; + if ((flags & DMU_READ_NO_DECRYPT) != 0) + dbuf_flags |= DB_RF_NO_DECRYPT; + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index a8f511061..55d03677f 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -27,8 +27,11 @@ * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2022 Axcient. 
*/ +#include #include #include #include @@ -67,6 +70,7 @@ static int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; static int zfs_recv_queue_ff = 20; static int zfs_recv_write_batch_size = 1024 * 1024; +static int zfs_recv_best_effort_corrective = 0; static const void *const dmu_recv_tag = "dmu_recv_tag"; const char *const recv_clone_name = "%recv"; @@ -102,6 +106,8 @@ struct receive_writer_arg { boolean_t done; int err; + const char *tofs; + boolean_t heal; boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ @@ -121,6 +127,7 @@ struct receive_writer_arg { uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; + zio_t *heal_pio; }; typedef struct dmu_recv_begin_arg { @@ -343,9 +350,10 @@ static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) { - uint64_t val; + uint64_t obj; uint64_t children; int error; + dsl_dataset_t *snap; dsl_pool_t *dp = ds->ds_dir->dd_pool; boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; @@ -354,7 +362,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, - 8, 1, &val); + 8, 1, &obj); if (error != ENOENT) return (error == 0 ? SET_ERROR(EBUSY) : error); @@ -362,12 +370,16 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (dsl_dataset_has_resume_receive_state(ds)) return (SET_ERROR(EBUSY)); - /* New snapshot name must not exist. */ + /* New snapshot name must not exist if we're not healing it. 
*/ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, - drba->drba_cookie->drc_tosnap, 8, 1, &val); - if (error != ENOENT) + drba->drba_cookie->drc_tosnap, 8, 1, &obj); + if (drba->drba_cookie->drc_heal) { + if (error != 0) + return (error); + } else if (error != ENOENT) { return (error == 0 ? SET_ERROR(EEXIST) : error); + } /* Must not have children if receiving a ZVOL. */ error = zap_count(dp->dp_meta_objset, @@ -392,8 +404,40 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (error != 0) return (error); - if (fromguid != 0) { - dsl_dataset_t *snap; + if (drba->drba_cookie->drc_heal) { + /* Encryption is incompatible with embedded data. */ + if (encrypted && embed) + return (SET_ERROR(EINVAL)); + + /* Healing is not supported when in 'force' mode. */ + if (drba->drba_cookie->drc_force) + return (SET_ERROR(EINVAL)); + + /* Must have keys loaded if doing encrypted non-raw recv. */ + if (encrypted && !raw) { + if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object, + NULL, NULL) != 0) + return (SET_ERROR(EACCES)); + } + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); + if (error != 0) + return (error); + + /* + * When not doing best effort corrective recv healing can only + * be done if the send stream is for the same snapshot as the + * one we are trying to heal. + */ + if (zfs_recv_best_effort_corrective == 0 && + drba->drba_cookie->drc_drrb->drr_toguid != + dsl_dataset_phys(snap)->ds_guid) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + dsl_dataset_rele(snap, FTAG); + } else if (fromguid != 0) { + /* Sanity check the incremental recv */ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Can't perform a raw receive on top of a non-raw receive */ @@ -459,7 +503,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_rele(snap, FTAG); } else { - /* if full, then must be forced */ + /* If full and not healing then must be forced. 
*/ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); @@ -626,6 +670,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) char buf[ZFS_MAX_DATASET_NAME_LEN]; objset_t *os; + /* healing recv must be done "into" an existing snapshot */ + if (drba->drba_cookie->drc_heal == B_TRUE) + return (SET_ERROR(ENOTSUP)); + /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. @@ -807,7 +855,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { - /* create temporary clone */ + /* Create temporary clone unless we're doing corrective recv */ dsl_dataset_t *snap = NULL; if (drba->drba_cookie->drc_fromsnapobj != 0) { @@ -815,8 +863,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } - dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, - snap, crflags, drba->drba_cred, dcp, tx); + if (drc->drc_heal) { + /* When healing we want to use the provided snapshot */ + VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap, + &dsobj)); + } else { + dsobj = dsl_dataset_create_sync(ds->ds_dir, + recv_clone_name, snap, crflags, drba->drba_cred, + dcp, tx); + } if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); @@ -933,7 +988,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && - (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) { + (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && + !drc->drc_heal) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } @@ -1141,7 +1197,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) */ int dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, nvlist_t *localprops, + boolean_t force, 
boolean_t heal, boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp, offset_t *voffp) { @@ -1154,6 +1210,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_heal = heal; drc->drc_resumable = resumable; drc->drc_cred = CRED(); drc->drc_proc = curproc; @@ -1243,6 +1300,182 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, return (err); } +/* + * Holds data needed for corrective recv callback + */ +typedef struct cr_cb_data { + uint64_t size; + zbookmark_phys_t zb; + spa_t *spa; +} cr_cb_data_t; + +static void +corrective_read_done(zio_t *zio) +{ + cr_cb_data_t *data = zio->io_private; + /* Corruption corrected; update error log if needed */ + if (zio->io_error == 0) + spa_remove_error(data->spa, &data->zb); + kmem_free(data, sizeof (cr_cb_data_t)); + abd_free(zio->io_abd); +} + +/* + * zio_rewrite the data pointed to by bp with the data from the rrd's abd. 
+ */ +static int +do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, + struct receive_record_arg *rrd, blkptr_t *bp) +{ + int err; + zio_t *io; + zbookmark_phys_t zb; + dnode_t *dn; + abd_t *abd = rrd->abd; + zio_cksum_t bp_cksum = bp->blk_cksum; + enum zio_flag flags = ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; + + if (rwa->raw) + flags |= ZIO_FLAG_RAW; + + err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn); + if (err != 0) + return (err); + SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0, + dbuf_whichblock(dn, 0, drrw->drr_offset)); + dnode_rele(dn, FTAG); + + if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) { + /* Decompress the stream data */ + abd_t *dabd = abd_alloc_linear( + drrw->drr_logical_size, B_FALSE); + err = zio_decompress_data(drrw->drr_compressiontype, + abd, abd_to_buf(dabd), abd_get_size(abd), + abd_get_size(dabd), NULL); + + if (err != 0) { + abd_free(dabd); + return (err); + } + /* Swap in the newly decompressed data into the abd */ + abd_free(abd); + abd = dabd; + } + + if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + /* Recompress the data */ + abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), + B_FALSE); + uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), + abd, abd_to_buf(cabd), abd_get_size(abd), + rwa->os->os_complevel); + abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); + /* Swap in newly compressed data into the abd */ + abd_free(abd); + abd = cabd; + flags |= ZIO_FLAG_RAW_COMPRESS; + } + + /* + * The stream is not encrypted but the data on-disk is. + * We need to re-encrypt the buf using the same + * encryption type, salt, iv, and mac that was used to encrypt + * the block previously. 
+ */ + if (!rwa->raw && BP_USES_CRYPT(bp)) { + dsl_dataset_t *ds; + dsl_crypto_key_t *dck = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + dsl_pool_t *dp = dmu_objset_pool(rwa->os); + abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); + + zio_crypt_decode_params_bp(bp, salt, iv); + zio_crypt_decode_mac_bp(bp, mac); + + dsl_pool_config_enter(dp, FTAG); + err = dsl_dataset_hold_flags(dp, rwa->tofs, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + abd_free(eabd); + return (SET_ERROR(EACCES)); + } + + /* Look up the key from the spa's keystore */ + err = spa_keystore_lookup_key(rwa->os->os_spa, + zb.zb_objset, FTAG, &dck); + if (err != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); + dsl_pool_config_exit(dp, FTAG); + abd_free(eabd); + return (SET_ERROR(EACCES)); + } + + err = zio_do_crypt_abd(B_TRUE, &dck->dck_key, + BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, + mac, abd_get_size(abd), abd, eabd, &no_crypt); + + spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + dsl_pool_config_exit(dp, FTAG); + + ASSERT0(no_crypt); + if (err != 0) { + abd_free(eabd); + return (err); + } + /* Swap in the newly encrypted data into the abd */ + abd_free(abd); + abd = eabd; + + /* + * We want to prevent zio_rewrite() from trying to + * encrypt the data again + */ + flags |= ZIO_FLAG_RAW_ENCRYPT; + } + rrd->abd = abd; + + io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd, + BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb); + + ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) || + abd_get_size(abd) == BP_GET_PSIZE(bp)); + + /* compute new bp checksum value and make sure it matches the old one */ + zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd)); + if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) { + 
zio_destroy(io); + if (zfs_recv_best_effort_corrective != 0) + return (0); + return (SET_ERROR(ECKSUM)); + } + + /* Correct the corruption in place */ + err = zio_wait(io); + if (err == 0) { + cr_cb_data_t *cb_data = + kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP); + cb_data->spa = rwa->os->os_spa; + cb_data->size = drrw->drr_logical_size; + cb_data->zb = zb; + /* Test if healing worked by re-reading the bp */ + err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp, + abd_alloc_for_io(drrw->drr_logical_size, B_FALSE), + drrw->drr_logical_size, corrective_read_done, + cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL)); + } + if (err != 0 && zfs_recv_best_effort_corrective != 0) + err = 0; + + return (err); +} + static int receive_read(dmu_recv_cookie_t *drc, int len, void *buf) { @@ -2049,6 +2282,58 @@ receive_process_write_record(struct receive_writer_arg *rwa, !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); + if (rwa->heal) { + blkptr_t *bp; + dmu_buf_t *dbp; + dnode_t *dn; + int flags = DB_RF_CANFAIL; + + if (rwa->raw) + flags |= DB_RF_NO_DECRYPT; + + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd), + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + err = dmu_buf_hold_noread(rwa->os, drrw->drr_object, + drrw->drr_offset, FTAG, &dbp); + if (err != 0) + return (err); + + /* Try to read the object to see if it needs healing */ + err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags); + /* + * We only try to heal when dbuf_read() returns ECKSUM. + * Other errors (even EIO) get returned to caller. + * EIO indicates that the device is not present/accessible, + * so writing to it will likely fail. + * If the block is healthy, we don't want to overwrite it + * unnecessarily. 
+ */ + if (err != ECKSUM) { + dmu_buf_rele(dbp, FTAG); + return (err); + } + dn = dmu_buf_dnode_enter(dbp); + /* Make sure the on-disk block and recv record sizes match */ + if (drrw->drr_logical_size != + dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { + err = ENOTSUP; + dmu_buf_dnode_exit(dbp); + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* Get the block pointer for the corrupted block */ + bp = dmu_buf_get_blkptr(dbp); + err = do_corrective_recv(rwa, drrw, rrd, bp); + dmu_buf_dnode_exit(dbp); + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* * For resuming to work, records must be in increasing order * by (object, offset). @@ -2341,7 +2626,8 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (!drc->drc_heal) + (void) dsl_destroy_head(name); } } @@ -2702,7 +2988,19 @@ receive_process_record(struct receive_writer_arg *rwa, ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; - if (rrd->header.drr_type != DRR_WRITE) { + /* We can only heal write records; other ones get ignored */ + if (rwa->heal && rrd->header.drr_type != DRR_WRITE) { + if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + return (0); + } + + if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { if (rrd->abd != NULL) { @@ -2737,9 +3035,16 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_WRITE: { err = receive_process_write_record(rwa, rrd); - if (err != EAGAIN) { + if (rwa->heal) { /* - * On success, receive_process_write_record() returns + * If healing - always free the abd after processing + */ + abd_free(rrd->abd); + rrd->abd = NULL; + } else if (err != EAGAIN) { + /* + * On success, a non-healing + * receive_process_write_record() 
returns * EAGAIN to indicate that we do not want to free * the rrd or arc_buf. */ @@ -2830,8 +3135,9 @@ receive_writer_thread(void *arg) * EAGAIN indicates that this record has been saved (on * raw->write_batch), and will be used again, so we don't * free it. + * When healing data we always need to free the record. */ - if (err != EAGAIN) { + if (err != EAGAIN || rwa->heal) { if (rwa->err == 0) rwa->err = err; kmem_free(rrd, sizeof (*rrd)); @@ -2839,10 +3145,13 @@ receive_writer_thread(void *arg) } kmem_free(rrd, sizeof (*rrd)); - int err = flush_write_batch(rwa); - if (rwa->err == 0) - rwa->err = err; - + if (rwa->heal) { + zio_wait(rwa->heal_pio); + } else { + int err = flush_write_batch(rwa); + if (rwa->err == 0) + rwa->err = err; + } mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); @@ -2926,17 +3235,19 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) if (err != 0) goto out; - /* - * If this is a new dataset we set the key immediately. - * Otherwise we don't want to change the key until we - * are sure the rest of the receive succeeded so we stash - * the keynvl away until then. - */ - err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), - drc->drc_ds->ds_object, drc->drc_fromsnapobj, - drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); - if (err != 0) - goto out; + if (!drc->drc_heal) { + /* + * If this is a new dataset we set the key immediately. + * Otherwise we don't want to change the key until we + * are sure the rest of the receive succeeded so we + * stash the keynvl away until then. 
+ */ + err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), + drc->drc_ds->ds_object, drc->drc_fromsnapobj, + drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); + if (err != 0) + goto out; + } /* see comment in dmu_recv_end_sync() */ drc->drc_ivset_guid = 0; @@ -2967,11 +3278,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; + rwa->heal = drc->drc_heal; + rwa->tofs = drc->drc_tofs; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; + if (drc->drc_heal) { + rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL, + ZIO_FLAG_GODFATHER); + } list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); @@ -3107,7 +3424,9 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); - if (!drc->drc_newfs) { + if (drc->drc_heal) { + error = 0; + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); @@ -3183,13 +3502,18 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; - uint64_t newsnapobj; + uint64_t newsnapobj = 0; spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; - if (!drc->drc_newfs) { + if (drc->drc_heal) { + if (drc->drc_keynvl != NULL) { + nvlist_free(drc->drc_keynvl); + drc->drc_keynvl = NULL; + } + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, @@ -3303,7 +3627,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) * tunable is set, in which case we will leave the 
newly-generated * value. */ - if (drc->drc_raw && drc->drc_ivset_guid != 0) { + if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) { dmu_object_zapify(dp->dp_meta_objset, newsnapobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj, @@ -3370,7 +3694,7 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) if (error != 0) { dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); - } else { + } else if (!drc->drc_heal) { if (drc->drc_newfs) { zvol_create_minor(drc->drc_tofs); } @@ -3400,3 +3724,7 @@ ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW, "Maximum amount of writes to batch into one transaction"); + +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW, + "Ignore errors during corrective receive"); +/* END CSTYLED */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 81e8209cd..b2b59af42 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1315,6 +1315,9 @@ spa_activate(spa_t *spa, spa_mode_t mode) avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_healed, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); spa_activate_os(spa); @@ -1425,6 +1428,7 @@ spa_deactivate(spa_t *spa) spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); + avl_destroy(&spa->spa_errlist_healed); spa_keystore_fini(&spa->spa_keystore); diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 95cf90983..4572a6e56 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -22,6 +22,7 @@ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014, Delphix. All rights reserved. * Copyright (c) 2021, George Amanakis. All rights reserved. + * Copyright (c) 2019 Datto Inc. 
*/ /* @@ -68,11 +69,13 @@ #include #include +#define NAME_MAX_LEN 64 + /* * spa_upgrade_errlog_limit : A zfs module parameter that controls the number - * of on-disk error log entries that will be converted to the new - * format when enabling head_errlog. Defaults to 0 which converts - * all log entries. + * of on-disk error log entries that will be converted to the new + * format when enabling head_errlog. Defaults to 0 which converts + * all log entries. */ static uint32_t spa_upgrade_errlog_limit = 0; @@ -511,6 +514,103 @@ get_errlist_size(spa_t *spa, avl_tree_t *tree) } #endif +/* + * If a healed bookmark matches an entry in the error log we stash it in a tree + * so that we can later remove the related log entries in sync context. + */ +static void +spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb) +{ + char name[NAME_MAX_LEN]; + + if (obj == 0) + return; + + bookmark_to_name(healed_zb, name, sizeof (name)); + mutex_enter(&spa->spa_errlog_lock); + if (zap_contains(spa->spa_meta_objset, obj, name) == 0) { + /* + * Found an error matching healed zb, add zb to our + * tree of healed errors + */ + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + mutex_enter(&spa->spa_errlist_lock); + if (avl_find(tree, &search, &where) != NULL) { + mutex_exit(&spa->spa_errlist_lock); + mutex_exit(&spa->spa_errlog_lock); + return; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + avl_insert(tree, new, where); + mutex_exit(&spa->spa_errlist_lock); + } + mutex_exit(&spa->spa_errlog_lock); +} + +/* + * If this error exists in the given tree remove it. 
+ */ +static void +remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb) +{ + spa_error_entry_t search, *found; + avl_index_t where; + + mutex_enter(&spa->spa_errlist_lock); + search.se_bookmark = *zb; + if ((found = avl_find(t, &search, &where)) != NULL) { + avl_remove(t, found); + kmem_free(found, sizeof (spa_error_entry_t)); + } + mutex_exit(&spa->spa_errlist_lock); +} + + +/* + * Removes all of the recv healed errors from both on-disk error logs + */ +static void +spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) +{ + char name[NAME_MAX_LEN]; + spa_error_entry_t *se; + void *cookie = NULL; + + ASSERT(MUTEX_HELD(&spa->spa_errlog_lock)); + + while ((se = avl_destroy_nodes(&spa->spa_errlist_healed, + &cookie)) != NULL) { + remove_error_from_list(spa, s, &se->se_bookmark); + remove_error_from_list(spa, l, &se->se_bookmark); + bookmark_to_name(&se->se_bookmark, name, sizeof (name)); + kmem_free(se, sizeof (spa_error_entry_t)); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_last, name, tx); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_scrub, name, tx); + } +} + +/* + * Stash away healed bookmarks to remove them from the on-disk error logs + * later in spa_remove_healed_errors(). + */ +void +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb) +{ + char name[NAME_MAX_LEN]; + + bookmark_to_name(zb, name, sizeof (name)); + + spa_add_healed_error(spa, spa->spa_errlog_last, zb); + spa_add_healed_error(spa, spa->spa_errlog_scrub, zb); +} + /* * Return the number of errors currently in the error log. 
This is actually the * sum of both the last log and the current log, since we don't know the union @@ -887,7 +987,7 @@ void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) { spa_error_entry_t *se; - char buf[64]; + char buf[NAME_MAX_LEN]; void *cookie; if (avl_numnodes(t) == 0) @@ -992,6 +1092,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && avl_numnodes(&spa->spa_errlist_last) == 0 && + avl_numnodes(&spa->spa_errlist_healed) == 0 && !spa->spa_scrub_finished) { mutex_exit(&spa->spa_errlist_lock); return; @@ -1006,6 +1107,11 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + /* + * Remove healed errors from errors. + */ + spa_remove_healed_errors(spa, &last, &scrub, tx); + /* * Sync out the current list of errors. */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 571e55573..382975208 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4928,7 +4928,7 @@ static boolean_t zfs_ioc_recv_inject_err; static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, - boolean_t resumable, int input_fd, + boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { @@ -4953,7 +4953,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); - error = dmu_recv_begin(tofs, tosnap, begin_record, force, + error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) @@ -5296,7 +5296,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, - NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, 
&begin_record, + NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); @@ -5329,6 +5329,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) + * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused @@ -5349,6 +5350,7 @@ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, @@ -5369,6 +5371,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; + boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; @@ -5398,6 +5401,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); + heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ @@ -5414,7 +5418,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, - hidden_args, force, resumable, input_fd, begin_record, + hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 
3d1ac36d9..7b55450ca 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -882,7 +882,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, return (zio); } -static void +void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a4ec27a36..8055c5193 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -241,7 +241,8 @@ tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props', - 'zfs_receive_-wR-encrypted-mix'] + 'zfs_receive_-wR-encrypted-mix', 'zfs_receive_corrective', + 'zfs_receive_compressed_corrective'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c index e84a00273..434cc863f 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check.c @@ -545,6 +545,7 @@ test_recv_new(const char *dataset, int fd) fnvlist_add_string(props, "org.openzfs:launch", "September 17th, 2013"); fnvlist_add_nvlist(optional, "localprops", props); fnvlist_add_boolean(optional, "force"); + fnvlist_add_boolean(optional, "heal"); fnvlist_add_int32(optional, "cleanup_fd", cleanup_fd); /* diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 4c5b11212..b13f66dc3 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -766,6 +766,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_receive/zfs_receive_raw.ksh \ functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \ + 
functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \ + functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \ functional/cli_root/zfs_rename/cleanup.ksh \ functional/cli_root/zfs_rename/setup.ksh \ functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh new file mode 100755 index 000000000..7f8eb0b13 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh @@ -0,0 +1,193 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 Datto, Inc. All rights reserved. +# Copyright (c) 2022 Axcient. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# OpenZFS should be able to heal data using corrective recv when the send file +# was generated with the --compressed flag +# +# STRATEGY: +# 0. Create a file, checksum the file to be corrupted then compare it's checksum +# with the one obtained after healing under different testing scenarios: +# 1. Test healing (aka corrective) recv from a full send file +# 2. Test healing recv (aka heal recv) from an incremental send file +# 3. Test healing recv when compression on-disk is off but source was compressed +# 4. Test heal recv when compression on-disk is on but source was uncompressed +# 5. Test heal recv when compression doesn't match between send file and on-disk +# 6. 
Test healing recv of an encrypted dataset using an unencrypted send file +# 7. Test healing recv (on an encrypted dataset) using a raw send file +# 8. Test healing when specifying destination filesystem only (no snapshot) +# 9. Test incremental recv after healing recv +# + +verify_runnable "both" + +DISK=${DISKS%% *} + +backup=$TEST_BASE_DIR/backup +raw_backup=$TEST_BASE_DIR/raw_backup +ibackup=$TEST_BASE_DIR/ibackup +unc_backup=$TEST_BASE_DIR/unc_backup + +function cleanup +{ + log_must rm -f $backup $raw_backup $ibackup $unc_backup + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must zpool create -f $TESTPOOL $DISK +} + +function test_corrective_recv +{ + log_must zpool scrub -w $TESTPOOL + log_must zpool status -v $TESTPOOL + log_must eval "zpool status -v $TESTPOOL | \ + grep \"Permanent errors have been detected\"" + + # make sure we will read the corruption from disk by flushing the ARC + log_must zinject -a + + log_must eval "zfs recv -c $1 < $2" + + log_must zpool scrub -w $TESTPOOL + log_must zpool status -v $TESTPOOL + log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"Permanent errors have been detected\"" + typeset cksum=$(md5digest $file) + [[ "$cksum" == "$checksum" ]] || \ + log_fail "Checksums differ ($cksum != $checksum)" +} + +log_onexit cleanup + +log_assert "ZFS corrective receive should be able to heal data corruption" + +typeset passphrase="password" +typeset file="/$TESTPOOL/$TESTFS1/$TESTFILE0" + +log_must eval "poolexists $TESTPOOL && destroy_pool $TESTPOOL" +log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL $DISK + +log_must eval "echo $passphrase > /$TESTPOOL/pwd" + +log_must zfs create -o primarycache=none \ + -o atime=off -o compression=lz4 $TESTPOOL/$TESTFS1 + +log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync +log_must eval "echo 'aaaaaaaa' >> "$file +typeset checksum=$(md5digest $file) + +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1 + +# create full send file +log_must eval "zfs 
send --compressed $TESTPOOL/$TESTFS1@snap1 > $backup" + +log_must dd if=/dev/urandom of=$file"1" bs=1024 count=1024 oflag=sync +log_must eval "echo 'bbbbbbbb' >> "$file"1" +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap2 +# create incremental send file +log_must eval "zfs send -c -i $TESTPOOL/$TESTFS1@snap1 \ + $TESTPOOL/$TESTFS1@snap2 > $ibackup" + +corrupt_blocks_at_level $file 0 +# test healing recv from a full send file +test_corrective_recv $TESTPOOL/$TESTFS1@snap1 $backup + +corrupt_blocks_at_level $file"1" 0 +# test healing recv from an incremental send file +test_corrective_recv $TESTPOOL/$TESTFS1@snap2 $ibackup + +# create new uncompressed dataset using our send file +log_must eval "zfs recv -o compression=off -o primarycache=none \ + $TESTPOOL/$TESTFS2 < $backup" +typeset compr=$(get_prop compression $TESTPOOL/$TESTFS2) +[[ "$compr" == "off" ]] || \ + log_fail "Unexpected compression $compr in recved dataset" +corrupt_blocks_at_level "/$TESTPOOL/$TESTFS2/$TESTFILE0" 0 +# test healing recv when compression on-disk is off but source was compressed +test_corrective_recv "$TESTPOOL/$TESTFS2@snap1" $backup + +# create a full sendfile from an uncompressed source +log_must eval "zfs send --compressed $TESTPOOL/$TESTFS2@snap1 > $unc_backup" +log_must eval "zfs recv -o compression=gzip -o primarycache=none \ + $TESTPOOL/testfs3 < $unc_backup" +typeset compr=$(get_prop compression $TESTPOOL/testfs3) +[[ "$compr" == "gzip" ]] || \ + log_fail "Unexpected compression $compr in recved dataset" +corrupt_blocks_at_level "/$TESTPOOL/testfs3/$TESTFILE0" 0 +# test healing recv when compression on-disk is on but source was uncompressed +test_corrective_recv "$TESTPOOL/testfs3@snap1" $unc_backup + +# create new compressed dataset using our send file +log_must eval "zfs recv -o compression=gzip -o primarycache=none \ + $TESTPOOL/testfs4 < $backup" +typeset compr=$(get_prop compression $TESTPOOL/testfs4) +[[ "$compr" == "gzip" ]] || \ + log_fail "Unexpected compression $compr in 
recved dataset" +corrupt_blocks_at_level "/$TESTPOOL/testfs4/$TESTFILE0" 0 +# test healing recv when compression doesn't match between send file and on-disk +test_corrective_recv "$TESTPOOL/testfs4@snap1" $backup + +# create new encrypted (and compressed) dataset using our send file +log_must eval "zfs recv -o encryption=aes-256-ccm -o keyformat=passphrase \ + -o keylocation=file:///$TESTPOOL/pwd -o primarycache=none \ + $TESTPOOL/testfs5 < $backup" +typeset encr=$(get_prop encryption $TESTPOOL/testfs5) +[[ "$encr" == "aes-256-ccm" ]] || \ + log_fail "Unexpected encryption $encr in recved dataset" +log_must eval "zfs send --raw $TESTPOOL/testfs5@snap1 > $raw_backup" +log_must eval "zfs send --compressed $TESTPOOL/testfs5@snap1 > $backup" +corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0 +# test healing recv of an encrypted dataset using an unencrypted send file +test_corrective_recv "$TESTPOOL/testfs5@snap1" $backup +corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0 +log_must zfs unmount $TESTPOOL/testfs5 +log_must zfs unload-key $TESTPOOL/testfs5 +# test healing recv (on an encrypted dataset) using a raw send file +test_corrective_recv "$TESTPOOL/testfs5@snap1" $raw_backup +# non raw send file healing an encrypted dataset with an unloaded key will fail +log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $backup" + +log_must zfs rollback -r $TESTPOOL/$TESTFS1@snap1 +corrupt_blocks_at_level $file 0 +# test healing when specifying destination filesystem only (no snapshot) +test_corrective_recv $TESTPOOL/$TESTFS1 $backup +# test incremental recv after healing recv +log_must eval "zfs recv $TESTPOOL/$TESTFS1 < $ibackup" + +# test that healing recv can not be combined with incompatible recv options +log_mustnot eval "zfs recv -h -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -F -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -s -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -u -c 
$TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -d -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -e -c $TESTPOOL/$TESTFS1@snap1 < $backup" + +# ensure healing recv doesn't work when snap GUIDS don't match +log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap2 < $backup" +log_mustnot eval "zfs recv -c $TESTPOOL/testfs5 < $backup" + +# test that healing recv doesn't work on non-existing snapshots +log_mustnot eval "zfs recv -c $TESTPOOL/$TESTFS1@missing < $backup" + +log_pass "OpenZFS corrective recv works for data healing" diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh new file mode 100755 index 000000000..b2bbdf2a7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh @@ -0,0 +1,192 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 Datto, Inc. All rights reserved. +# Copyright (c) 2022 Axcient. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# OpenZFS should be able to heal data using corrective recv +# +# STRATEGY: +# 0. Create a file, checksum the file to be corrupted then compare it's checksum +# with the one obtained after healing under different testing scenarios: +# 1. Test healing (aka corrective) recv from a full send file +# 2. Test healing recv (aka heal recv) from an incremental send file +# 3. Test healing recv when compression on-disk is off but source was compressed +# 4. 
Test heal recv when compression on-disk is on but source was uncompressed +# 5. Test heal recv when compression doesn't match between send file and on-disk +# 6. Test healing recv of an encrypted dataset using an unencrypted send file +# 7. Test healing recv (on an encrypted dataset) using a raw send file +# 8. Test healing when specifying destination filesystem only (no snapshot) +# 9. Test incremental recv after healing recv +# + +verify_runnable "both" + +# use only the first disk listed in $DISKS +DISK=${DISKS%% *} + +# send stream files produced and consumed by the scenarios below +backup=$TEST_BASE_DIR/backup +raw_backup=$TEST_BASE_DIR/raw_backup +ibackup=$TEST_BASE_DIR/ibackup +unc_backup=$TEST_BASE_DIR/unc_backup + +# Remove the send stream files, destroy $TESTPOOL if it exists and +# recreate it on $DISK. +function cleanup +{ + log_must rm -f $backup $raw_backup $ibackup $unc_backup + + poolexists $TESTPOOL && destroy_pool $TESTPOOL + log_must zpool create -f $TESTPOOL $DISK +} + +# Run a corrective receive and verify that it heals the pool. +# $1 - target dataset or snapshot handed to "zfs recv -c" +# $2 - file containing the send stream to receive +# Uses the globals $file and $checksum for the final md5 comparison. +function test_corrective_recv +{ + # scrub and confirm that permanent errors are reported before healing + log_must zpool scrub -w $TESTPOOL + log_must zpool status -v $TESTPOOL + log_must eval "zpool status -v $TESTPOOL | \ + grep \"Permanent errors have been detected\"" + + # make sure we will read the corruption from disk by flushing the ARC + log_must zinject -a + + log_must eval "zfs recv -c $1 < $2" + + # scrub again: no permanent errors may remain and $file must match $checksum + log_must zpool scrub -w $TESTPOOL + log_must zpool status -v $TESTPOOL + log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"Permanent errors have been detected\"" + typeset cksum=$(md5digest $file) + [[ "$cksum" == "$checksum" ]] || \ + log_fail "Checksums differ ($cksum != $checksum)" +} + +log_onexit cleanup + +log_assert "ZFS corrective receive should be able to heal data corruption" + +typeset passphrase="password" +typeset file="/$TESTPOOL/$TESTFS1/$TESTFILE0" + +log_must eval "poolexists $TESTPOOL && destroy_pool $TESTPOOL" +log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL $DISK + +log_must eval "echo $passphrase > /$TESTPOOL/pwd" + +log_must zfs create -o primarycache=none \ + -o atime=off -o compression=lz4 $TESTPOOL/$TESTFS1 + +log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync +log_must eval 
"echo 'aaaaaaaa' >> "$file +# reference digest of $file; test_corrective_recv compares against it +typeset checksum=$(md5digest $file) + +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1 + +# create full send file +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 > $backup" + +log_must dd if=/dev/urandom of=$file"1" bs=1024 count=1024 oflag=sync +log_must eval "echo 'bbbbbbbb' >> "$file"1" +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap2 +# create incremental send file +log_must eval "zfs send -i $TESTPOOL/$TESTFS1@snap1 \ + $TESTPOOL/$TESTFS1@snap2 > $ibackup" + +corrupt_blocks_at_level $file 0 +# test healing recv from a full send file +test_corrective_recv $TESTPOOL/$TESTFS1@snap1 $backup + +corrupt_blocks_at_level $file"1" 0 +# test healing recv from an incremental send file +test_corrective_recv $TESTPOOL/$TESTFS1@snap2 $ibackup + +# create new uncompressed dataset using our send file +log_must eval "zfs recv -o compression=off -o primarycache=none \ + $TESTPOOL/$TESTFS2 < $backup" +typeset compr=$(get_prop compression $TESTPOOL/$TESTFS2) +[[ "$compr" == "off" ]] || \ + log_fail "Unexpected compression $compr in recved dataset" +corrupt_blocks_at_level "/$TESTPOOL/$TESTFS2/$TESTFILE0" 0 +# test healing recv when compression on-disk is off but source was compressed +test_corrective_recv "$TESTPOOL/$TESTFS2@snap1" $backup + +# create a full send file from an uncompressed source +log_must eval "zfs send $TESTPOOL/$TESTFS2@snap1 > $unc_backup" +log_must eval "zfs recv -o compression=gzip -o primarycache=none \ + $TESTPOOL/testfs3 < $unc_backup" +typeset compr=$(get_prop compression $TESTPOOL/testfs3) +[[ "$compr" == "gzip" ]] || \ + log_fail "Unexpected compression $compr in recved dataset" +corrupt_blocks_at_level "/$TESTPOOL/testfs3/$TESTFILE0" 0 +# test healing recv when compression on-disk is on but source was uncompressed +test_corrective_recv "$TESTPOOL/testfs3@snap1" $unc_backup + +# create new compressed dataset using our send file +log_must eval "zfs recv -o compression=gzip -o primarycache=none \ + $TESTPOOL/testfs4 < $backup" 
+typeset compr=$(get_prop compression $TESTPOOL/testfs4) +[[ "$compr" == "gzip" ]] || \ + log_fail "Unexpected compression $compr in recved dataset" +corrupt_blocks_at_level "/$TESTPOOL/testfs4/$TESTFILE0" 0 +# test healing recv when compression doesn't match between send file and on-disk +test_corrective_recv "$TESTPOOL/testfs4@snap1" $backup + +# create new encrypted (and compressed) dataset using our send file +log_must eval "zfs recv -o encryption=aes-256-ccm -o keyformat=passphrase \ + -o keylocation=file:///$TESTPOOL/pwd -o primarycache=none \ + $TESTPOOL/testfs5 < $backup" +typeset encr=$(get_prop encryption $TESTPOOL/testfs5) +[[ "$encr" == "aes-256-ccm" ]] || \ + log_fail "Unexpected encryption $encr in recved dataset" +log_must eval "zfs send --raw $TESTPOOL/testfs5@snap1 > $raw_backup" +log_must eval "zfs send $TESTPOOL/testfs5@snap1 > $backup" +corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0 +# test healing recv of an encrypted dataset using an unencrypted send file +test_corrective_recv "$TESTPOOL/testfs5@snap1" $backup +corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0 +log_must zfs unmount $TESTPOOL/testfs5 +log_must zfs unload-key $TESTPOOL/testfs5 +# test healing recv (on an encrypted dataset) using a raw send file +test_corrective_recv "$TESTPOOL/testfs5@snap1" $raw_backup +# a non-raw send file cannot heal an encrypted dataset whose key is unloaded +log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $backup" + +# roll back to snap1; -r also destroys the later snap2, which the +# incremental recv below re-creates +log_must zfs rollback -r $TESTPOOL/$TESTFS1@snap1 +corrupt_blocks_at_level $file 0 +# test healing when specifying destination filesystem only (no snapshot) +test_corrective_recv $TESTPOOL/$TESTFS1 $backup +# test incremental recv after healing recv +log_must eval "zfs recv $TESTPOOL/$TESTFS1 < $ibackup" + +# test that healing recv can not be combined with incompatible recv options +log_mustnot eval "zfs recv -h -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -F -c $TESTPOOL/$TESTFS1@snap1 < 
$backup" +log_mustnot eval "zfs recv -s -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -u -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -d -c $TESTPOOL/$TESTFS1@snap1 < $backup" +log_mustnot eval "zfs recv -e -c $TESTPOOL/$TESTFS1@snap1 < $backup" + +# ensure healing recv doesn't work when snap GUIDs don't match +log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap2 < $backup" +log_mustnot eval "zfs recv -c $TESTPOOL/testfs5 < $backup" + +# test that healing recv doesn't work on non-existing snapshots +log_mustnot eval "zfs recv -c $TESTPOOL/$TESTFS1@missing < $backup" + +log_pass "OpenZFS corrective recv works for data healing"