Mirror of https://git.proxmox.com/git/mirror_zfs.git (synced 2025-10-26 18:05:04 +03:00)
ZFS send should use spill block prefetched from send_reader_thread

Currently, even though send_reader_thread prefetches spill blocks, do_dump()
does not use them and instead issues its own blocking arc_read. This causes
significant performance degradation when sending datasets with many spill
blocks.

For unmodified spill blocks, we now also create a send_range struct for them
in send_reader_thread and issue prefetches for them. We piggyback these on the
dnode send_range instead of enqueueing them separately so we don't break the
send_range_after check.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Co-authored-by: david.chen <david.chen@nutanix.com>
Closes #16701
parent 7b6e9675da
commit 5945676bcc
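The flow this commit introduces can be summarized as follows: the reader thread
issues the spill-block read up front and attaches the resulting range to the
dnode (OBJECT) record it belongs to, and the writer then dumps the dnode
followed by the piggybacked spill block without issuing its own blocking
arc_read. The sketch below is a minimal, self-contained C model of that
hand-off; the types and helpers in it (object_range, spill_range,
do_dump_object) are simplified stand-ins for illustration only, not the
OpenZFS definitions, which appear in the diff that follows.

#include <stdio.h>
#include <stdlib.h>

/*
 * Simplified stand-ins for the real send_range machinery; illustration
 * only, not the OpenZFS types.
 */
struct spill_range {
	int	prefetched;	/* set once the async read has been issued */
};

struct object_range {
	unsigned long long	object;
	/* Piggybacked unmodified spill block; NULL if there is none. */
	struct spill_range	*spill_range;
};

/*
 * Reader-thread side: issue the prefetch and attach the spill range to the
 * dnode (OBJECT) range instead of enqueueing it as a separate record.
 */
static void
piggyback_unmodified_spill(struct object_range *range)
{
	struct spill_range *sr = calloc(1, sizeof (*sr));

	sr->prefetched = 1;		/* models issuing the async read */
	range->spill_range = sr;	/* piggyback on the dnode range */
}

/*
 * Writer side: dump the dnode record first, then the piggybacked spill
 * block; no blocking read is needed because the data was prefetched.
 */
static int
do_dump_object(const struct object_range *range)
{
	printf("dump dnode for object %llu\n", range->object);
	if (range->spill_range != NULL)
		printf("dump prefetched spill block (prefetched=%d)\n",
		    range->spill_range->prefetched);
	return (0);
}

int
main(void)
{
	struct object_range r = { .object = 42, .spill_range = NULL };

	piggyback_unmodified_spill(&r);	/* reader thread */
	(void) do_dump_object(&r);	/* writer loop */

	free(r.spill_range);
	return (0);
}

In the actual patch, the piggyback helper also returns the spill block's size
so the reader thread can account for it when enqueueing the OBJECT range
(bqueue_enqueue(outq, range, sizeof (*range) + spill)).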
module/zfs/dmu_send.c

@@ -180,6 +180,8 @@ struct send_range {
 			 */
 			dnode_phys_t		*dnp;
 			blkptr_t		bp;
+			/* Piggyback unmodified spill block */
+			struct send_range	*spill_range;
 		} object;
 		struct srr {
 			uint32_t		datablksz;
@@ -231,6 +233,8 @@ range_free(struct send_range *range)
 		size_t size = sizeof (dnode_phys_t) *
 		    (range->sru.object.dnp->dn_extra_slots + 1);
 		kmem_free(range->sru.object.dnp, size);
+		if (range->sru.object.spill_range)
+			range_free(range->sru.object.spill_range);
 	} else if (range->type == DATA) {
 		mutex_enter(&range->sru.data.lock);
 		while (range->sru.data.io_outstanding)
@@ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
 	drrs->drr_length = blksz;
 	drrs->drr_toguid = dscp->dsc_toguid;
 
-	/* See comment in dump_dnode() for full details */
+	/* See comment in piggyback_unmodified_spill() for full details */
 	if (zfs_send_unmodified_spill_blocks &&
 	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
@@ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
 		return (SET_ERROR(EINTR));
 
-	/*
-	 * Send DRR_SPILL records for unmodified spill blocks.  This is useful
-	 * because changing certain attributes of the object (e.g. blocksize)
-	 * can cause old versions of ZFS to incorrectly remove a spill block.
-	 * Including these records in the stream forces an up to date version
-	 * to always be written ensuring they're never lost.  Current versions
-	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
-	 * ignore these unmodified spill blocks.
-	 */
-	if (zfs_send_unmodified_spill_blocks &&
-	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
-	    (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
-		struct send_range record;
-		blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
-
-		memset(&record, 0, sizeof (struct send_range));
-		record.type = DATA;
-		record.object = object;
-		record.eos_marker = B_FALSE;
-		record.start_blkid = DMU_SPILL_BLKID;
-		record.end_blkid = record.start_blkid + 1;
-		record.sru.data.bp = *bp;
-		record.sru.data.obj_type = dnp->dn_type;
-		record.sru.data.datablksz = BP_GET_LSIZE(bp);
-
-		if (do_dump(dscp, &record) != 0)
-			return (SET_ERROR(EINTR));
-	}
-
 	if (dscp->dsc_err != 0)
 		return (SET_ERROR(EINTR));
 
@@ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 	case OBJECT:
 		err = dump_dnode(dscp, &range->sru.object.bp, range->object,
 		    range->sru.object.dnp);
+		/* Dump piggybacked unmodified spill block */
+		if (!err && range->sru.object.spill_range)
+			err = do_dump(dscp, range->sru.object.spill_range);
 		return (err);
 	case OBJECT_RANGE: {
 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
@@ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 
 		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
-		if (BP_GET_TYPE(bp) == DMU_OT_SA) {
-			arc_flags_t aflags = ARC_FLAG_WAIT;
-			zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
-
-			if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
-				ASSERT(BP_IS_PROTECTED(bp));
-				zioflags |= ZIO_FLAG_RAW;
-			}
-
-			zbookmark_phys_t zb;
-			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
-			zb.zb_objset = dmu_objset_id(dscp->dsc_os);
-			zb.zb_object = range->object;
-			zb.zb_level = 0;
-			zb.zb_blkid = range->start_blkid;
-
-			arc_buf_t *abuf = NULL;
-			if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
-			    bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
-			    zioflags, &aflags, &zb) != 0)
-				return (SET_ERROR(EIO));
-
-			err = dump_spill(dscp, bp, zb.zb_object,
-			    (abuf == NULL ? NULL : abuf->b_data));
-			if (abuf != NULL)
-				arc_buf_destroy(abuf, &abuf);
-			return (err);
-		}
+
 		if (send_do_embed(bp, dscp->dsc_featureflags)) {
 			err = dump_write_embedded(dscp, range->object,
 			    range->start_blkid * srdp->datablksz,
@@ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 		}
 		ASSERT(range->object > dscp->dsc_resume_object ||
 		    (range->object == dscp->dsc_resume_object &&
+		    (range->start_blkid == DMU_SPILL_BLKID ||
 		    range->start_blkid * srdp->datablksz >=
-		    dscp->dsc_resume_offset));
+		    dscp->dsc_resume_offset)));
 		/* it's a level-0 block of a regular object */
 
 		mutex_enter(&srdp->lock);
@@ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 		ASSERT(dscp->dsc_dso->dso_dryrun ||
 		    srdp->abuf != NULL || srdp->abd != NULL);
 
-		uint64_t offset = range->start_blkid * srdp->datablksz;
-
 		char *data = NULL;
 		if (srdp->abd != NULL) {
 			data = abd_to_buf(srdp->abd);
 | 
			
		||||
			data = srdp->abuf->b_data;
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if (BP_GET_TYPE(bp) == DMU_OT_SA) {
 | 
			
		||||
			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
 | 
			
		||||
			err = dump_spill(dscp, bp, range->object, data);
 | 
			
		||||
			return (err);
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		uint64_t offset = range->start_blkid * srdp->datablksz;
 | 
			
		||||
 | 
			
		||||
		/*
 | 
			
		||||
		 * If we have large blocks stored on disk but the send flags
 | 
			
		||||
		 * don't allow us to send large blocks, we split the data from
 | 
			
		||||
@ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
 | 
			
		||||
		range->sru.data.io_outstanding = 0;
 | 
			
		||||
		range->sru.data.io_err = 0;
 | 
			
		||||
		range->sru.data.io_compressed = B_FALSE;
 | 
			
		||||
	} else if (type == OBJECT) {
 | 
			
		||||
		range->sru.object.spill_range = NULL;
 | 
			
		||||
	}
 | 
			
		||||
	return (range);
 | 
			
		||||
}
 | 
			
		||||
@@ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
 	bqueue_enqueue(q, range, datablksz);
 }
 
+/*
+ * Send DRR_SPILL records for unmodified spill blocks.  This is useful
+ * because changing certain attributes of the object (e.g. blocksize)
+ * can cause old versions of ZFS to incorrectly remove a spill block.
+ * Including these records in the stream forces an up to date version
+ * to always be written ensuring they're never lost.  Current versions
+ * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
+ * ignore these unmodified spill blocks.
+ *
+ * We piggyback the spill_range to dnode range instead of enqueueing it
+ * so send_range_after won't complain.
+ */
+static uint64_t
+piggyback_unmodified_spill(struct send_reader_thread_arg *srta,
+    struct send_range *range)
+{
+	ASSERT3U(range->type, ==, OBJECT);
+
+	dnode_phys_t *dnp = range->sru.object.dnp;
+	uint64_t fromtxg = srta->smta->to_arg->fromtxg;
+
+	if (!zfs_send_unmodified_spill_blocks ||
+	    !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ||
+	    !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg))
+		return (0);
+
+	blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
+	struct send_range *spill_range = range_alloc(DATA, range->object,
+	    DMU_SPILL_BLKID, DMU_SPILL_BLKID+1, B_FALSE);
+	spill_range->sru.data.bp = *bp;
+	spill_range->sru.data.obj_type = dnp->dn_type;
+	spill_range->sru.data.datablksz = BP_GET_LSIZE(bp);
+
+	issue_data_read(srta, spill_range);
+	range->sru.object.spill_range = spill_range;
+
+	return (BP_GET_LSIZE(bp));
+}
+
 /*
  * This thread is responsible for two things: First, it retrieves the correct
  * blkptr in the to ds if we need to send the data because of something from
@@ -1773,17 +1772,20 @@ send_reader_thread(void *arg)
 	uint64_t last_obj_exists = B_TRUE;
 	while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
 	    err == 0) {
+		uint64_t spill = 0;
 		switch (range->type) {
 		case DATA:
 			issue_data_read(srta, range);
 			bqueue_enqueue(outq, range, range->sru.data.datablksz);
 			range = get_next_range_nofree(inq, range);
 			break;
-		case HOLE:
 		case OBJECT:
+			spill = piggyback_unmodified_spill(srta, range);
+			zfs_fallthrough;
+		case HOLE:
 		case OBJECT_RANGE:
 		case REDACT: // Redacted blocks must exist
-			bqueue_enqueue(outq, range, sizeof (*range));
+			bqueue_enqueue(outq, range, sizeof (*range) + spill);
 			range = get_next_range_nofree(inq, range);
 			break;
 		case PREVIOUSLY_REDACTED: {