mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-12 19:20:28 +03:00
OpenZFS 6393 - zfs receive a full send as a clone
Authored by: Paul Dagnelie <pcd@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Prakash Surya <prakash.surya@delphix.com> Reviewed by: Richard Elling <Richard.Elling@RichardElling.com> Approved by: Dan McDonald <danmcd@omniti.com> Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/6394 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/68ecb2e
This commit is contained in:
parent
fd41e93563
commit
e6d3a843d6
@ -24,7 +24,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_DMU_IMPL_H
|
||||
@ -268,7 +268,6 @@ typedef struct dmu_sendarg {
|
||||
uint64_t dsa_toguid;
|
||||
int dsa_err;
|
||||
dmu_pendop_t dsa_pending_op;
|
||||
boolean_t dsa_incremental;
|
||||
uint64_t dsa_featureflags;
|
||||
uint64_t dsa_last_data_object;
|
||||
uint64_t dsa_last_data_offset;
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_ZFS_IOCTL_H
|
||||
@ -138,6 +138,16 @@ typedef enum dmu_send_resume_token_version {
|
||||
|
||||
#define DRR_FLAG_CLONE (1<<0)
|
||||
#define DRR_FLAG_CI_DATA (1<<1)
|
||||
/*
|
||||
* This send stream, if it is a full send, includes the FREE and FREEOBJECT
|
||||
* records that are created by the sending process. This means that the send
|
||||
* stream can be received as a clone, even though it is not an incremental.
|
||||
* This is not implemented as a feature flag, because the receiving side does
|
||||
* not need to have implemented it to receive this stream; it is fully backwards
|
||||
* compatible. We need a flag, though, because full send streams without it
|
||||
* cannot necessarily be received as a clone correctly.
|
||||
*/
|
||||
#define DRR_FLAG_FREERECORDS (1<<2)
|
||||
|
||||
/*
|
||||
* flags in the drr_checksumflags field in the DRR_WRITE and
|
||||
|
@ -22,7 +22,7 @@
|
||||
.\"
|
||||
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
|
||||
.\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
|
||||
.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||
.\" Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||
.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
|
||||
.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
|
||||
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
@ -2991,7 +2991,12 @@ Discard all but the last element of the sent snapshot's file system name, using
|
||||
.ad
|
||||
.sp .6
|
||||
.RS 4n
|
||||
Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin.
|
||||
Forces the stream to be received as a clone of the given snapshot.
|
||||
If the stream is a full send stream, this will create the filesystem
|
||||
described by the stream as a clone of the specified snapshot. The choice
|
||||
of snapshot will not affect the success or failure of the
|
||||
receive, as long as the snapshot exists. If the stream is an
|
||||
incremental send stream, all the normal verification will be performed.
|
||||
.RE
|
||||
|
||||
.RE
|
||||
|
@ -20,11 +20,10 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
|
||||
* Copyright 2014 HybridCluster. All rights reserved.
|
||||
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
|
||||
*/
|
||||
|
||||
@ -173,6 +172,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill in the drr_free struct, or perform aggregation if the previous record is
|
||||
* also a free record, and the two are adjacent.
|
||||
*
|
||||
* Note that we send free records even for a full send, because we want to be
|
||||
* able to receive a full send as a clone, which requires a list of all the free
|
||||
* and freeobject records that were generated on the source.
|
||||
*/
|
||||
static int
|
||||
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
|
||||
uint64_t length)
|
||||
@ -196,15 +203,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
|
||||
(object == dsp->dsa_last_data_object &&
|
||||
offset > dsp->dsa_last_data_offset));
|
||||
|
||||
/*
|
||||
* If we are doing a non-incremental send, then there can't
|
||||
* be any data in the dataset we're receiving into. Therefore
|
||||
* a free record would simply be a no-op. Save space by not
|
||||
* sending it to begin with.
|
||||
*/
|
||||
if (!dsp->dsa_incremental)
|
||||
return (0);
|
||||
|
||||
if (length != -1ULL && offset + length < offset)
|
||||
length = -1ULL;
|
||||
|
||||
@ -382,10 +380,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
|
||||
{
|
||||
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
|
||||
|
||||
/* See comment in dump_free(). */
|
||||
if (!dsp->dsa_incremental)
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
|
||||
* push it out, since free block aggregation can only be done for
|
||||
@ -796,6 +790,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
|
||||
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
|
||||
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
|
||||
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
|
||||
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
|
||||
|
||||
if (ancestor_zb != NULL) {
|
||||
drr->drr_u.drr_begin.drr_fromguid =
|
||||
@ -818,7 +813,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
|
||||
dsp->dsa_off = off;
|
||||
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
|
||||
dsp->dsa_pending_op = PENDING_NONE;
|
||||
dsp->dsa_incremental = (ancestor_zb != NULL);
|
||||
dsp->dsa_featureflags = featureflags;
|
||||
dsp->dsa_resume_object = resumeobj;
|
||||
dsp->dsa_resume_offset = resumeoff;
|
||||
@ -1336,7 +1330,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
|
||||
/* target fs already exists; recv into temp clone */
|
||||
|
||||
/* Can't recv a clone into an existing fs */
|
||||
if (flags & DRR_FLAG_CLONE) {
|
||||
if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
@ -1355,6 +1349,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
|
||||
drba->drba_origin))
|
||||
return (SET_ERROR(ENOENT));
|
||||
|
||||
/*
|
||||
* If we're receiving a full send as a clone, and it doesn't
|
||||
* contain all the necessary free records and freeobject
|
||||
* records, reject it.
|
||||
*/
|
||||
if (fromguid == 0 && drba->drba_origin &&
|
||||
!(flags & DRR_FLAG_FREERECORDS))
|
||||
return (SET_ERROR(EINVAL));
|
||||
|
||||
/* Open the parent of tofs */
|
||||
ASSERT3U(strlen(tofs), <, MAXNAMELEN);
|
||||
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
|
||||
@ -1394,7 +1397,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
|
||||
if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
|
||||
fromguid != 0) {
|
||||
dsl_dataset_rele(origin, FTAG);
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
return (SET_ERROR(ENODEV));
|
||||
@ -1724,6 +1728,20 @@ struct receive_writer_arg {
|
||||
uint64_t bytes_read; /* bytes read when current record created */
|
||||
};
|
||||
|
||||
struct objlist {
|
||||
list_t list; /* List of struct receive_objnode. */
|
||||
/*
|
||||
* Last object looked up. Used to assert that objects are being looked
|
||||
* up in ascending order.
|
||||
*/
|
||||
uint64_t last_lookup;
|
||||
};
|
||||
|
||||
struct receive_objnode {
|
||||
list_node_t node;
|
||||
uint64_t object;
|
||||
};
|
||||
|
||||
struct receive_arg {
|
||||
objset_t *os;
|
||||
vnode_t *vp; /* The vnode to read the stream from */
|
||||
@ -1741,12 +1759,7 @@ struct receive_arg {
|
||||
int err;
|
||||
boolean_t byteswap;
|
||||
/* Sorted list of objects not to issue prefetches for. */
|
||||
list_t ignore_obj_list;
|
||||
};
|
||||
|
||||
struct receive_ign_obj_node {
|
||||
list_node_t node;
|
||||
uint64_t object;
|
||||
struct objlist ignore_objlist;
|
||||
};
|
||||
|
||||
typedef struct guid_map_entry {
|
||||
@ -2063,13 +2076,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
|
||||
struct drr_freeobjects *drrfo)
|
||||
{
|
||||
uint64_t obj;
|
||||
int next_err = 0;
|
||||
|
||||
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
|
||||
return (SET_ERROR(EINVAL));
|
||||
|
||||
for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
|
||||
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
|
||||
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
|
||||
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
|
||||
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
|
||||
dmu_object_info_t doi;
|
||||
int err;
|
||||
|
||||
@ -2085,7 +2099,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
|
||||
if (err != 0)
|
||||
return (err);
|
||||
}
|
||||
|
||||
if (next_err != ESRCH)
|
||||
return (next_err);
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -2415,6 +2430,70 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
objlist_create(struct objlist *list)
|
||||
{
|
||||
list_create(&list->list, sizeof (struct receive_objnode),
|
||||
offsetof(struct receive_objnode, node));
|
||||
list->last_lookup = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
objlist_destroy(struct objlist *list)
|
||||
{
|
||||
struct receive_objnode *n;
|
||||
|
||||
for (n = list_remove_head(&list->list);
|
||||
n != NULL; n = list_remove_head(&list->list)) {
|
||||
kmem_free(n, sizeof (*n));
|
||||
}
|
||||
list_destroy(&list->list);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function looks through the objlist to see if the specified object number
|
||||
* is contained in the objlist. In the process, it will remove all object
|
||||
* numbers in the list that are smaller than the specified object number. Thus,
|
||||
* any lookup of an object number smaller than a previously looked up object
|
||||
* number will always return false; therefore, all lookups should be done in
|
||||
* ascending order.
|
||||
*/
|
||||
static boolean_t
|
||||
objlist_exists(struct objlist *list, uint64_t object)
|
||||
{
|
||||
struct receive_objnode *node = list_head(&list->list);
|
||||
ASSERT3U(object, >=, list->last_lookup);
|
||||
list->last_lookup = object;
|
||||
while (node != NULL && node->object < object) {
|
||||
VERIFY3P(node, ==, list_remove_head(&list->list));
|
||||
kmem_free(node, sizeof (*node));
|
||||
node = list_head(&list->list);
|
||||
}
|
||||
return (node != NULL && node->object == object);
|
||||
}
|
||||
|
||||
/*
|
||||
* The objlist is a list of object numbers stored in ascending order. However,
|
||||
* the insertion of new object numbers does not seek out the correct location to
|
||||
* store a new object number; instead, it appends it to the list for simplicity.
|
||||
* Thus, any users must take care to only insert new object numbers in ascending
|
||||
* order.
|
||||
*/
|
||||
static void
|
||||
objlist_insert(struct objlist *list, uint64_t object)
|
||||
{
|
||||
struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
|
||||
node->object = object;
|
||||
#ifdef ZFS_DEBUG
|
||||
{
|
||||
struct receive_objnode *last_object = list_tail(&list->list);
|
||||
uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
|
||||
ASSERT3U(node->object, >, last_objnum);
|
||||
}
|
||||
#endif
|
||||
list_insert_tail(&list->list, node);
|
||||
}
|
||||
|
||||
/*
|
||||
* Issue the prefetch reads for any necessary indirect blocks.
|
||||
*
|
||||
@ -2437,13 +2516,7 @@ static void
|
||||
receive_read_prefetch(struct receive_arg *ra,
|
||||
uint64_t object, uint64_t offset, uint64_t length)
|
||||
{
|
||||
struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
|
||||
while (node != NULL && node->object < object) {
|
||||
VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
|
||||
kmem_free(node, sizeof (*node));
|
||||
node = list_head(&ra->ignore_obj_list);
|
||||
}
|
||||
if (node == NULL || node->object > object) {
|
||||
if (!objlist_exists(&ra->ignore_objlist, object)) {
|
||||
dmu_prefetch(ra->os, object, 1, offset, length,
|
||||
ZIO_PRIORITY_SYNC_READ);
|
||||
}
|
||||
@ -2476,20 +2549,7 @@ receive_read_record(struct receive_arg *ra)
|
||||
*/
|
||||
if (err == ENOENT ||
|
||||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
|
||||
struct receive_ign_obj_node *node =
|
||||
kmem_zalloc(sizeof (*node),
|
||||
KM_SLEEP);
|
||||
node->object = drro->drr_object;
|
||||
#ifdef ZFS_DEBUG
|
||||
{
|
||||
struct receive_ign_obj_node *last_object =
|
||||
list_tail(&ra->ignore_obj_list);
|
||||
uint64_t last_objnum = (last_object != NULL ?
|
||||
last_object->object : 0);
|
||||
ASSERT3U(node->object, >, last_objnum);
|
||||
}
|
||||
#endif
|
||||
list_insert_tail(&ra->ignore_obj_list, node);
|
||||
objlist_insert(&ra->ignore_objlist, drro->drr_object);
|
||||
err = 0;
|
||||
}
|
||||
return (err);
|
||||
@ -2706,7 +2766,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Read in the stream's records, one by one, and apply them to the pool. There
|
||||
* are two threads involved; the thread that calls this function will spin up a
|
||||
@ -2727,7 +2786,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
|
||||
struct receive_arg *ra;
|
||||
struct receive_writer_arg *rwa;
|
||||
int featureflags;
|
||||
struct receive_ign_obj_node *n;
|
||||
uint32_t payloadlen;
|
||||
void *payload;
|
||||
nvlist_t *begin_nvl = NULL;
|
||||
@ -2746,8 +2804,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
|
||||
sizeof (ra->bytes_read), 1, &ra->bytes_read);
|
||||
}
|
||||
|
||||
list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node),
|
||||
offsetof(struct receive_ign_obj_node, node));
|
||||
objlist_create(&ra->ignore_objlist);
|
||||
|
||||
/* these were verified in dmu_recv_begin */
|
||||
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
|
||||
@ -2901,12 +2958,7 @@ out:
|
||||
}
|
||||
|
||||
*voffp = ra->voff;
|
||||
|
||||
for (n = list_remove_head(&ra->ignore_obj_list); n != NULL;
|
||||
n = list_remove_head(&ra->ignore_obj_list)) {
|
||||
kmem_free(n, sizeof (*n));
|
||||
}
|
||||
list_destroy(&ra->ignore_obj_list);
|
||||
objlist_destroy(&ra->ignore_objlist);
|
||||
kmem_free(ra, sizeof (*ra));
|
||||
kmem_free(rwa, sizeof (*rwa));
|
||||
return (err);
|
||||
|
@ -152,7 +152,8 @@ tests = []
|
||||
[tests/functional/cli_root/zfs_receive]
|
||||
tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
|
||||
'zfs_receive_005_neg', 'zfs_receive_006_pos',
|
||||
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg']
|
||||
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
|
||||
'zfs_receive_010_pos']
|
||||
|
||||
# DISABLED:
|
||||
# zfs_rename_002_pos - needs investigation
|
||||
|
Loading…
Reference in New Issue
Block a user