/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; int zfs_send_queue_length = SPA_MAXBLOCKSIZE; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ int zfs_send_set_freerecords_bit = B_TRUE; /* * Use this to override the recordsize calculation for fast zfs send estimates. */ unsigned long zfs_override_estimate_recordsize = 0; #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ uint64_t fromtxg; /* Traverse from this txg */ int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; zbookmark_phys_t resume; }; struct send_block_record { boolean_t eos_marker; /* Marks the end of the stream */ blkptr_t bp; zbookmark_phys_t zb; uint8_t indblkshift; uint16_t datablkszsec; bqueue_node_t ln; }; typedef struct dump_bytes_io { dmu_sendarg_t *dbi_dsp; void *dbi_buf; int dbi_len; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; dmu_sendarg_t *dsp = dbi->dbi_dsp; dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); ssize_t resid; /* have to get resid to get detailed errno */ /* * The code does not rely on len being a multiple of 8. We keep * this assertion because of the corresponding assertion in * receive_read(). Keeping this assertion ensures that we do not * inadvertently break backwards compatibility (causing the assertion * in receive_read() to trigger on old software). Newer feature flags * (such as raw send) may break this assertion since they were * introduced after the requirement was made obsolete. */ ASSERT(dbi->dbi_len % 8 == 0 || (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, (caddr_t)dbi->dbi_buf, dbi->dbi_len, 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); mutex_enter(&ds->ds_sendstream_lock); *dsp->dsa_off += dbi->dbi_len; mutex_exit(&ds->ds_sendstream_lock); } static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { dump_bytes_io_t dbi; dbi.dbi_dsp = dsp; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dsp->dsa_err); } /* * For all record types except BEGIN, fill in the checksum (overlaid in * drr_u.drr_checksum.drr_checksum). The checksum verifies everything * up to the start of the checksum itself. */ static int dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); (void) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); if (dsp->dsa_drr->drr_type == DRR_BEGIN) { dsp->dsa_sent_begin = B_TRUE; } else { ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. drr_checksum.drr_checksum)); dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; } if (dsp->dsa_drr->drr_type == DRR_END) { dsp->dsa_sent_end = B_TRUE; } (void) fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { (void) fletcher_4_incremental_native(payload, payload_len, &dsp->dsa_zc); if (dump_bytes(dsp, payload, payload_len) != 0) return (SET_ERROR(EINTR)); } return (0); } /* * Fill in the drr_free struct, or perform aggregation if the previous record is * also a free record, and the two are adjacent. * * Note that we send free records even for a full send, because we want to be * able to receive a full send as a clone, which requires a list of all the free * and freeobject records that were generated on the source. */ static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * * If the increasing-order constraint ever changes, we should find * another way to assert that the one-record constraint is still * satisfied. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is * DMU_OBJECT_END (because dump_dnode is the only place where * this function is called with a DMU_OBJECT_END, and only after * flushing any pending record). */ ASSERT(length != DMU_OBJECT_END); /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { if (offset + length < offset) drrf->drr_length = DMU_OBJECT_END; else drrf->drr_length += length; return (0); } else { /* not a continuation. Push out pending record */ if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; if (offset + length < offset) drrf->drr_length = DMU_OBJECT_END; else drrf->drr_length = length; drrf->drr_toguid = dsp->dsa_toguid; if (length == DMU_OBJECT_END) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { dsp->dsa_pending_op = PENDING_FREE; } return (0); } static int dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) { uint64_t payload_size; boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); dsp->dsa_last_data_object = object; dsp->dsa_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either * a grouping of free objects or free blocks), push it out to * the stream, since aggregation can't be done across operations * of different types. */ if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a WRITE record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed or raw */ if (raw || lsize != psize) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3S(psize, >, 0); if (raw) { ASSERT(BP_IS_PROTECTED(bp)); /* * This is a raw protected block so we need to pass * along everything the receiving side will need to * interpret this block, including the byteswap, salt, * IV, and MAC. */ if (BP_SHOULD_BYTESWAP(bp)) drrw->drr_flags |= DRR_RAW_BYTESWAP; zio_crypt_decode_params_bp(bp, drrw->drr_salt, drrw->drr_iv); zio_crypt_decode_mac_bp(bp, drrw->drr_mac); } else { /* this is a compressed block */ ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); ASSERT3S(lsize, >=, psize); } /* set fields common to compressed and raw sends */ drrw->drr_compressiontype = BP_GET_COMPRESS(bp); drrw->drr_compressed_size = psize; payload_size = drrw->drr_compressed_size; } else { payload_size = drrw->drr_logical_size; } if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) { /* * There's no pre-computed checksum for partial-block writes, * embedded BP's, or encrypted BP's that are being sent as * plaintext, so (like fletcher4-checkummed blocks) userland * will have to compute a dedup-capable checksum itself. */ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & ZCHECKSUM_FLAG_DEDUP) drrw->drr_flags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; } if (dump_record(dsp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = &(dsp->dsa_drr->drr_u.drr_write_embedded); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); drrw->drr_psize = BPE_GET_PSIZE(bp); decode_embedded_bp_compressed(bp, buf); if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data) { struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); uint64_t blksz = BP_GET_LSIZE(bp); uint64_t payload_size = blksz; if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a SPILL record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; drrs->drr_toguid = dsp->dsa_toguid; /* handle raw send fields */ if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) drrs->drr_flags |= DRR_RAW_BYTESWAP; drrs->drr_compressiontype = BP_GET_COMPRESS(bp); drrs->drr_compressed_size = BP_GET_PSIZE(bp); zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv); zio_crypt_decode_mac_bp(bp, drrs->drr_mac); payload_size = drrs->drr_compressed_size; } if (dump_record(dsp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); uint64_t maxobj = DNODES_PER_BLOCK * (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1); /* * ZoL < 0.7 does not handle large FREEOBJECTS records correctly, * leading to zfs recv never completing. to avoid this issue, don't * send FREEOBJECTS records for object IDs which cannot exist on the * receiving side. */ if (maxobj > 0) { if (maxobj < firstobj) return (0); if (maxobj < firstobj + numobjs) numobjs = maxobj - firstobj; } /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREEOBJECTS) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one */ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { drrfo->drr_numobjs += numobjs; return (0); } else { /* can't be aggregated. Push out pending record */ if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; drrfo->drr_toguid = dsp->dsa_toguid; dsp->dsa_pending_op = PENDING_FREEOBJECTS; return (0); } static int dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); int bonuslen; if (object < dsp->dsa_resume_object) { /* * Note: when resuming, we will visit all the dnodes in * the block of dnodes that we are resuming from. In * this case it's unnecessary to send the dnodes prior to * the one we are resuming from. We should be at most one * block's worth of dnodes behind the resume point. */ ASSERT3U(dsp->dsa_resume_object - object, <, 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); return (0); } if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write an OBJECT record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8); if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) { ASSERT(BP_IS_ENCRYPTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) drro->drr_flags |= DRR_RAW_BYTESWAP; /* needed for reconstructing dnp on recv side */ drro->drr_maxblkid = dnp->dn_maxblkid; drro->drr_indblkshift = dnp->dn_indblkshift; drro->drr_nlevels = dnp->dn_nlevels; drro->drr_nblkptr = dnp->dn_nblkptr; /* * Since we encrypt the entire bonus area, the (raw) part * beyond the bonuslen is actually nonzero, so we need * to send it. */ if (bonuslen != 0) { drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp); bonuslen = drro->drr_raw_bonuslen; } } if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0) return (SET_ERROR(EINTR)); /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj, uint64_t numslots) { struct drr_object_range *drror = &(dsp->dsa_drr->drr_u.drr_object_range); /* we only use this record type for raw sends */ ASSERT(BP_IS_PROTECTED(bp)); ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE); ASSERT0(BP_GET_LEVEL(bp)); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE; drror->drr_firstobj = firstobj; drror->drr_numslots = numslots; drror->drr_toguid = dsp->dsa_toguid; if (BP_SHOULD_BYTESWAP(bp)) drror->drr_flags |= DRR_RAW_BYTESWAP; zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv); zio_crypt_decode_mac_bp(bp, drror->drr_mac); if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); /* * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) return (B_FALSE); /* * Embed type must be explicitly enabled. */ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: return (B_FALSE); } return (B_FALSE); } /* * This is the callback function to traverse_dataset that acts as the worker * thread for dmu_send_impl. */ /*ARGSUSED*/ static int send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { struct send_thread_arg *sta = arg; struct send_block_record *record; uint64_t record_size; int err = 0; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= sta->resume.zb_object); ASSERT3P(sta->ds, !=, NULL); if (sta->cancel) return (SET_ERROR(EINTR)); if (bp == NULL) { ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); return (0); } else if (zb->zb_level < 0) { return (0); } record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); record->eos_marker = B_FALSE; record->bp = *bp; record->zb = *zb; record->indblkshift = dnp->dn_indblkshift; record->datablkszsec = dnp->dn_datablkszsec; record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; bqueue_enqueue(&sta->q, record, record_size); return (err); } /* * This function kicks off the traverse_dataset. It also handles setting the * error code of the thread in case something goes wrong, and pushes the End of * Stream record when the traverse_dataset call has finished. If there is no * dataset to traverse, the thread immediately pushes End of Stream marker. */ static void send_traverse_thread(void *arg) { struct send_thread_arg *st_arg = arg; int err; struct send_block_record *data; fstrans_cookie_t cookie = spl_fstrans_mark(); if (st_arg->ds != NULL) { err = traverse_dataset_resume(st_arg->ds, st_arg->fromtxg, &st_arg->resume, st_arg->flags, send_cb, st_arg); if (err != EINTR) st_arg->error_code = err; } data = kmem_zalloc(sizeof (*data), KM_SLEEP); data->eos_marker = B_TRUE; bqueue_enqueue(&st_arg->q, data, 1); spl_fstrans_unmark(cookie); thread_exit(); } /* * This function actually handles figuring out what kind of record needs to be * dumped, reading the data (which has hopefully been prefetched), and calling * the appropriate helper function. */ static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) { dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); const blkptr_t *bp = &data->bp; const zbookmark_phys_t *zb = &data->zb; uint8_t indblkshift = data->indblkshift; uint16_t dblkszsec = data->datablkszsec; spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; ASSERT3U(zb->zb_level, >=, 0); ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= dsa->dsa_resume_object); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ if (dsa->dsa_os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { spa_log_error(spa, zb); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", ds->ds_object); return (SET_ERROR(EIO)); } if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t offset = zb->zb_blkid * span; /* Don't dump free records for offsets > DMU_OBJECT_END */ if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid) err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; enum zio_flag zioflags = ZIO_FLAG_CANFAIL; if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_ENCRYPTED(bp)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); zioflags |= ZIO_FLAG_RAW; } ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) return (SET_ERROR(EIO)); dnode_phys_t *blk = abuf->b_data; uint64_t dnobj = zb->zb_blkid * epb; /* * Raw sends require sending encryption parameters for the * block of dnodes. Regular sends do not need to send this * info. */ if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(arc_is_encrypted(abuf)); err = dump_object_range(dsa, bp, dnobj, epb); } if (err == 0) { for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { err = dump_dnode(dsa, bp, dnobj + i, blk + i); if (err != 0) break; } } arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; enum zio_flag zioflags = ZIO_FLAG_CANFAIL; if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); zioflags |= ZIO_FLAG_RAW; } if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) return (SET_ERROR(EIO)); err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data); arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; ASSERT0(zb->zb_level); err = dump_write_embedded(dsa, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = dblkszsec << SPA_MINBLOCKSHIFT; uint64_t offset; /* * If we have large blocks stored on disk but the send flags * don't allow us to send large blocks, we split the data from * the arc buf into chunks. */ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); /* * Raw sends require that we always get raw data as it exists * on disk, so we assert that we are not splitting blocks here. */ boolean_t request_raw = (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0; /* * We should only request compressed data from the ARC if all * the following are true: * - stream compression was requested * - we aren't splitting large blocks into smaller chunks * - the data won't need to be byteswapped before sending * - this isn't an embedded block * - this isn't metadata (if receiving on a different endian * system it can be byteswapped more easily) */ boolean_t request_compressed = (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); IMPLY(request_raw, !split_large_blocks); IMPLY(request_raw, BP_IS_PROTECTED(bp)); ASSERT0(zb->zb_level); ASSERT(zb->zb_object > dsa->dsa_resume_object || (zb->zb_object == dsa->dsa_resume_object && zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); enum zio_flag zioflags = ZIO_FLAG_CANFAIL; if (request_raw) zioflags |= ZIO_FLAG_RAW; else if (request_compressed) zioflags |= ZIO_FLAG_RAW_COMPRESS; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, blksz); uint64_t *ptr; for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; ptr++) *ptr = 0x2f5baddb10cULL; } else { return (SET_ERROR(EIO)); } } offset = zb->zb_blkid * blksz; if (split_large_blocks) { ASSERT0(arc_is_encrypted(abuf)); ASSERT3U(arc_get_compression(abuf), ==, ZIO_COMPRESS_OFF); char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); err = dump_write(dsa, type, zb->zb_object, offset, n, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { err = dump_write(dsa, type, zb->zb_object, offset, blksz, arc_buf_size(abuf), bp, abuf->b_data); } arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); return (err); } /* * Pop the new data off the queue, and free the old data. */ static struct send_block_record * get_next_record(bqueue_t *bq, struct send_block_record *data) { struct send_block_record *tmp = bqueue_dequeue(bq); kmem_free(data, sizeof (*data)); return (tmp); } /* * Actually do the bulk of the work in a zfs send. * * Note: Releases dp using the specified tag. */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, offset_t *off) { objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; struct send_thread_arg to_arg; void *payload = NULL; size_t payload_len = 0; struct send_block_record *to_data; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } /* * If this is a non-raw send of an encrypted ds, we can ensure that * the objset_phys_t is authenticated. This is safe because this is * either a snapshot or we have owned the dataset, ensuring that * it can't be modified. */ if (!rawok && os->os_encrypted && arc_is_unauthenticated(os->os_phys_buf)) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = arc_untransform(os->os_phys_buf, os->os_spa, &zb, B_FALSE); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } ASSERT0(arc_is_unauthenticated(os->os_phys_buf)); } drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, DMU_SUBSTREAM); bzero(&to_arg, sizeof (to_arg)); #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } #endif /* raw sends imply large_block_ok */ if ((large_block_ok || rawok) && dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; /* encrypted datasets will not have embedded blocks */ if ((embedok || rawok) && !os->os_encrypted && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; } /* raw send implies compressok */ if (compressok || rawok) featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; if (rawok && os->os_encrypted) featureflags |= DMU_BACKUP_FEATURE_RAW; if ((featureflags & (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { featureflags |= DMU_BACKUP_FEATURE_LZ4; } if (resumeobj != 0 || resumeoff != 0) { featureflags |= DMU_BACKUP_FEATURE_RESUMING; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; if (zfs_send_set_freerecords_bit) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; if (ancestor_zb != NULL) { drr->drr_u.drr_begin.drr_fromguid = ancestor_zb->zbm_guid; fromtxg = ancestor_zb->zbm_creation_txg; } dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); if (!to_ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); dsp->dsa_drr = drr; dsp->dsa_vp = vp; dsp->dsa_outfd = outfd; dsp->dsa_proc = curproc; dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_featureflags = featureflags; dsp->dsa_resume_object = resumeobj; dsp->dsa_resume_offset = resumeoff; mutex_enter(&to_ds->ds_sendstream_lock); list_insert_head(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); /* handle features that require a DRR_BEGIN payload */ if (featureflags & (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) { nvlist_t *keynvl = NULL; nvlist_t *nvl = fnvlist_alloc(); if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { dmu_object_info_t to_doi; err = dmu_object_info(os, resumeobj, &to_doi); if (err != 0) { fnvlist_free(nvl); goto out; } SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, resumeoff / to_doi.doi_data_block_size); fnvlist_add_uint64(nvl, "resume_object", resumeobj); fnvlist_add_uint64(nvl, "resume_offset", resumeoff); } if (featureflags & DMU_BACKUP_FEATURE_RAW) { uint64_t ivset_guid = (ancestor_zb != NULL) ? ancestor_zb->zbm_ivset_guid : 0; ASSERT(os->os_encrypted); err = dsl_crypto_populate_key_nvlist(to_ds, ivset_guid, &keynvl); if (err != 0) { fnvlist_free(nvl); goto out; } fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); } payload = fnvlist_pack(nvl, &payload_len); drr->drr_payloadlen = payload_len; fnvlist_free(keynvl); fnvlist_free(nvl); } err = dump_record(dsp, payload, payload_len); fnvlist_pack_free(payload, payload_len); if (err != 0) { err = dsp->dsa_err; goto out; } err = bqueue_init(&to_arg.q, MAX(zfs_send_queue_length, 2 * zfs_max_recordsize), offsetof(struct send_block_record, ln)); to_arg.error_code = 0; to_arg.cancel = B_FALSE; to_arg.ds = to_ds; to_arg.fromtxg = fromtxg; to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; if (rawok) to_arg.flags |= TRAVERSE_NO_DECRYPT; (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, TS_RUN, minclsyspri); to_data = bqueue_dequeue(&to_arg.q); while (!to_data->eos_marker && err == 0) { err = do_dump(dsp, to_data); to_data = get_next_record(&to_arg.q, to_data); if (issig(JUSTLOOKING) && issig(FORREAL)) err = EINTR; } if (err != 0) { to_arg.cancel = B_TRUE; while (!to_data->eos_marker) { to_data = get_next_record(&to_arg.q, to_data); } } kmem_free(to_data, sizeof (*to_data)); bqueue_destroy(&to_arg.q); if (err == 0 && to_arg.error_code != 0) err = to_arg.error_code; if (err != 0) goto out; if (dsp->dsa_pending_op != PENDING_NONE) if (dump_record(dsp, NULL, 0) != 0) err = SET_ERROR(EINTR); if (err != 0) { if (err == EINTR && dsp->dsa_err != 0) err = dsp->dsa_err; goto out; } bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; if (dump_record(dsp, NULL, 0) != 0) err = dsp->dsa_err; out: mutex_enter(&to_ds->ds_sendstream_lock); list_remove(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); dsl_dataset_long_rele(to_ds, FTAG); return (err); } int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; dsl_dataset_t *fromds = NULL; ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; int err; err = dsl_pool_hold(pool, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != 0) { zfs_bookmark_phys_t zb = { 0 }; boolean_t is_clone; err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); if (err != 0) { dsl_dataset_rele_flags(ds, dsflags, FTAG); dsl_pool_rele(dp, FTAG); return (err); } if (!dsl_dataset_is_before(ds, fromds, 0)) { err = SET_ERROR(EXDEV); dsl_dataset_rele(fromds, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); dsl_pool_rele(dp, FTAG); return (err); } zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; if (dsl_dataset_is_zapified(fromds)) { (void) zap_lookup(dp->dp_meta_objset, fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, &zb.zbm_ivset_guid); } is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, compressok, rawok, outfd, 0, 0, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, compressok, rawok, outfd, 0, 0, vp, off); } dsl_dataset_rele_flags(ds, dsflags, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; boolean_t owned = B_FALSE; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); err = dsl_pool_hold(tosnap, FTAG, &dp); if (err != 0) return (err); if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { /* * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds); owned = B_TRUE; } else { err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds); } if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != NULL) { zfs_bookmark_phys_t zb = { 0 }; boolean_t is_clone = B_FALSE; int fsnamelen = strchr(tosnap, '@') - tosnap; /* * If the fromsnap is in a different filesystem, then * mark the send stream as a clone. */ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || (fromsnap[fsnamelen] != '@' && fromsnap[fsnamelen] != '#')) { is_clone = B_TRUE; } if (strchr(fromsnap, '@')) { dsl_dataset_t *fromds; err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); if (err == 0) { if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (ds->ds_dir != fromds->ds_dir); if (dsl_dataset_is_zapified(fromds)) { (void) zap_lookup(dp->dp_meta_objset, fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, &zb.zbm_ivset_guid); } dsl_dataset_rele(fromds, FTAG); } } else { err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); } if (err != 0) { if (owned) dsl_dataset_disown(ds, dsflags, FTAG); else dsl_dataset_rele_flags(ds, dsflags, FTAG); dsl_pool_rele(dp, FTAG); return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, compressok, rawok, outfd, resumeobj, resumeoff, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, compressok, rawok, outfd, resumeobj, resumeoff, vp, off); } if (owned) dsl_dataset_disown(ds, dsflags, FTAG); else dsl_dataset_rele_flags(ds, dsflags, FTAG); return (err); } static int dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { int err = 0; uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ uint64_t recordsize; uint64_t record_count; objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); /* Assume all (uncompressed) blocks are recordsize. */ if (zfs_override_estimate_recordsize != 0) { recordsize = zfs_override_estimate_recordsize; } else if (os->os_phys->os_type == DMU_OST_ZVOL) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); } else { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); } if (err != 0) return (err); record_count = uncompressed / recordsize; /* * If we're estimating a send size for a compressed stream, use the * compressed data size to estimate the stream size. Otherwise, use the * uncompressed data size. */ size = stream_compressed ? compressed : uncompressed; /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). * Assume no ditto blocks or internal fragmentation. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per * block. */ size -= record_count * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ size += record_count * sizeof (dmu_replay_record_t); *sizep = size; return (0); } int dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, boolean_t stream_compressed, uint64_t *sizep) { int err; uint64_t uncomp, comp; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* tosnap must be a snapshot */ if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* fromsnap, if provided, must be a snapshot */ if (fromds != NULL && !fromds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* * fromsnap must be an earlier snapshot from the same fs as tosnap, * or the origin's fs. */ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) return (SET_ERROR(EXDEV)); /* Get compressed and uncompressed size estimates of changed data. */ if (fromds == NULL) { uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; comp = dsl_dataset_phys(ds)->ds_compressed_bytes; } else { uint64_t used; err = dsl_dataset_space_written(fromds, ds, &used, &comp, &uncomp); if (err != 0) return (err); } err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, stream_compressed, sizep); /* * Add the size of the BEGIN and END records to the estimate. */ *sizep += 2 * sizeof (dmu_replay_record_t); return (err); } struct calculate_send_arg { uint64_t uncompressed; uint64_t compressed; }; /* * Simple callback used to traverse the blocks of a snapshot and sum their * uncompressed and compressed sizes. */ /* ARGSUSED */ static int dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct calculate_send_arg *space = arg; if (bp != NULL && !BP_IS_HOLE(bp)) { space->uncompressed += BP_GET_UCSIZE(bp); space->compressed += BP_GET_PSIZE(bp); } return (0); } /* * Given a desination snapshot and a TXG, calculate the approximate size of a * send stream sent from that TXG. from_txg may be zero, indicating that the * whole snapshot will be sent. */ int dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, boolean_t stream_compressed, uint64_t *sizep) { int err; struct calculate_send_arg size = { 0 }; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* tosnap must be a snapshot */ if (!dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); /* verify that from_txg is before the provided snapshot was taken */ if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { return (SET_ERROR(EXDEV)); } /* * traverse the blocks of the snapshot with birth times after * from_txg, summing their uncompressed size */ err = traverse_dataset(ds, from_txg, TRAVERSE_POST | TRAVERSE_NO_DECRYPT, dmu_calculate_send_traversal, &size); if (err) return (err); err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, size.compressed, stream_compressed, sizep); return (err); } #if defined(_KERNEL) /* BEGIN CSTYLED */ module_param(zfs_override_estimate_recordsize, ulong, 0644); MODULE_PARM_DESC(zfs_override_estimate_recordsize, "Record size calculation override for zfs send estimates"); /* END CSTYLED */ module_param(zfs_send_corrupt_data, int, 0644); MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data"); module_param(zfs_send_queue_length, int, 0644); MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length"); #endif