From ea97f8ce35a8a515610e52b7e4744549f9c510f4 Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Mon, 29 Jul 2013 10:58:53 -0800
Subject: [PATCH] Illumos #3834

3834 incremental replication of 'holey' file systems is slow
Reviewed by: Adam Leventhal
Reviewed by: George Wilson
Approved by: Richard Lowe

References:
  https://www.illumos.org/issues/3834
  illumos/illumos-gate@ca48f36f20f6098ceb19d5b084b6b3d4b8eca9fa

Ported-by: Richard Yao
Signed-off-by: Brian Behlendorf
Issue #1775
---
 include/sys/dmu_impl.h |  6 ++++++
 include/sys/dmu_send.h |  3 ++-
 module/zfs/dbuf.c      | 20 +++++++++++++++++++-
 module/zfs/dmu_send.c  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 module/zfs/dmu_tx.c    | 48 +++++++++++++++++++++++++++++-----------------
 5 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h
index f13a2a37c..bbff15df9 100644
--- a/include/sys/dmu_impl.h
+++ b/include/sys/dmu_impl.h
@@ -21,7 +21,10 @@
 /*
  * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
+ */
+/*
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DMU_IMPL_H
@@ -265,6 +268,9 @@ typedef struct dmu_sendarg {
 	uint64_t dsa_toguid;
 	int dsa_err;
 	dmu_pendop_t dsa_pending_op;
+	boolean_t dsa_incremental;
+	uint64_t dsa_last_data_object;
+	uint64_t dsa_last_data_offset;
 } dmu_sendarg_t;
 
 #ifdef __cplusplus
diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h
index 6442b20f7..65514b762 100644
--- a/include/sys/dmu_send.h
+++ b/include/sys/dmu_send.h
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
@@ -63,5 +63,6 @@ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
     int cleanup_fd, uint64_t *action_handlep);
 int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
+boolean_t dmu_objset_is_receiving(objset_t *os);
 
 #endif /* _DMU_SEND_H */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index cfa16b7c0..c1d0b294c 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -28,6 +28,7 @@
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/dmu.h>
+#include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
@@ -846,9 +847,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 /*
  * Evict (if its unreferenced) or clear (if its referenced) any level-0
  * data blocks in the free range, so that any future readers will find
- * empty blocks.  Also, if we happen accross any level-1 dbufs in the
+ * empty blocks.  Also, if we happen across any level-1 dbufs in the
  * range that have not already been marked dirty, mark them dirty so
  * they stay in memory.
+ *
+ * This is a no-op if the dataset is in the middle of an incremental
+ * receive; see comment below for details.
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
@@ -864,6 +868,20 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 		last_l1 = end >> epbs;
 	}
 	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+
+	if (dmu_objset_is_receiving(dn->dn_objset)) {
+		/*
+		 * When processing a free record from a zfs receive, there
+		 * should have been no previous modifications to the data
+		 * in this range.  Therefore there should be no dbufs in the range.
+		 * Searching dn_dbufs for these non-existent dbufs can be
+		 * very expensive, so simply ignore this.
+		 */
+		VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
+		VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
+		return;
+	}
+
 	mutex_enter(&dn->dn_dbufs_mtx);
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index d38e2c512..f6df6b2d6 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -109,6 +109,32 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
 {
 	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
 
+	/*
+	 * When we receive a free record, dbuf_free_range() assumes
+	 * that the receiving system doesn't have any dbufs in the range
+	 * being freed.  This is always true because there is a one-record
+	 * constraint: we only send one WRITE record for any given
+	 * object+offset.  We know that the one-record constraint is
+	 * true because we always send data in increasing order by
+	 * object,offset.
+	 *
+	 * If the increasing-order constraint ever changes, we should find
+	 * another way to assert that the one-record constraint is still
+	 * satisfied.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+
+	/*
+	 * If we are doing a non-incremental send, then there can't
+	 * be any data in the dataset we're receiving into.  Therefore
+	 * a free record would simply be a no-op.  Save space by not
+	 * sending it to begin with.
+	 */
+	if (!dsp->dsa_incremental)
+		return (0);
+
 	if (length != -1ULL && offset + length < offset)
 		length = -1ULL;
 
@@ -175,6 +201,15 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
 {
 	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
 
+	/*
+	 * We send data in increasing object, offset order.
+	 * See comment in dump_free() for details.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+	dsp->dsa_last_data_object = object;
+	dsp->dsa_last_data_offset = offset + blksz - 1;
+
 	/*
 	 * If there is any kind of pending aggregation (currently either
@@ -242,6 +277,10 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
 {
 	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
 
+	/* See comment in dump_free(). */
+	if (!dsp->dsa_incremental)
+		return (0);
+
 	/*
 	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
 	 * push it out, since free block aggregation can only be done for
@@ -318,9 +357,9 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
 	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
 		return (SET_ERROR(EINTR));
-	/* free anything past the end of the file */
+	/* Free anything past the end of the file. */
 	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
-	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
 		return (SET_ERROR(EINTR));
 
 	if (dsp->dsa_err != 0)
 		return (SET_ERROR(EINTR));
@@ -503,6 +542,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	dsp->dsa_toguid = ds->ds_phys->ds_guid;
 	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
 	dsp->dsa_pending_op = PENDING_NONE;
+	dsp->dsa_incremental = (fromtxg != 0);
 
 	mutex_enter(&ds->ds_sendstream_lock);
 	list_insert_head(&ds->ds_sendstreams, dsp);
@@ -1799,3 +1839,13 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 	else
 		return (dmu_recv_existing_end(drc));
 }
+
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+	return (os->os_dsl_dataset != NULL &&
+	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
+}
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index caac60193..7ee214bb0 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -604,8 +604,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
-	uint64_t start, end, i;
-	int err, shift;
+	int err;
 	zio_t *zio;
 
 	ASSERT(tx->tx_txg == 0);
@@ -616,30 +615,45 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 		return;
 	dn = txh->txh_dnode;
 
-	/* first block */
-	if (off != 0)
-		dmu_tx_count_write(txh, off, 1);
-	/* last block */
-	if (len != DMU_OBJECT_END)
-		dmu_tx_count_write(txh, off+len, 1);
-
-	dmu_tx_count_dnode(txh);
-
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 
+	dmu_tx_count_dnode(txh);
+
 	/*
-	 * For i/o error checking, read the first and last level-0
-	 * blocks, and all the level-1 blocks.  The above count_write's
-	 * have already taken care of the level-0 blocks.
+	 * For i/o error checking, we read the first and last level-0
+	 * blocks if they are not aligned, and all the level-1 blocks.
+	 *
+	 * Note: dbuf_free_range() assumes that we have not instantiated
+	 * any level-0 dbufs that will be completely freed.  Therefore we must
+	 * exercise care to not read or count the first and last blocks
+	 * if they are blocksize-aligned.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		dmu_tx_count_write(txh, off, len);
+	} else {
+		/* first block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off, 1);
+		/* last block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off+len, 1);
+	}
+
+	/*
+	 * Check level-1 blocks.
 	 */
 	if (dn->dn_nlevels > 1) {
-		shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
-		start = off >> shift;
-		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+		uint64_t start = off >> shift;
+		uint64_t end = (off + len) >> shift;
+		uint64_t i;
+
+		ASSERT(dn->dn_datablkshift != 0);
+		ASSERT(dn->dn_indblkshift != 0);
 
 		zio = zio_root(tx->tx_pool->dp_spa,
 		    NULL, NULL, ZIO_FLAG_CANFAIL);
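
The receive-side short-circuit in dbuf_free_range() is only safe because of the ordering
invariant asserted above in dump_data() and dump_free(): a send stream describes strictly
increasing (object, offset) ranges, so no WRITE record can cover data that an earlier record
already covered. Below is a minimal standalone sketch of that invariant check; stream_state
and note_write_record() are hypothetical names for illustration, not code from this patch.

#include <assert.h>
#include <stdint.h>

struct stream_state {
	uint64_t last_data_object;	/* object of the last WRITE record */
	uint64_t last_data_offset;	/* last byte covered by that record */
};

/*
 * Each WRITE record must begin past the end of the previous one,
 * mirroring the ASSERTs added to dump_data() and dump_free().
 */
static void
note_write_record(struct stream_state *ss, uint64_t object,
    uint64_t offset, uint64_t blksz)
{
	assert(object > ss->last_data_object ||
	    (object == ss->last_data_object &&
	    offset > ss->last_data_offset));
	ss->last_data_object = object;
	ss->last_data_offset = offset + blksz - 1;
}

int
main(void)
{
	struct stream_state ss = { 0, 0 };

	note_write_record(&ss, 1, 0, 4096);	/* object 1, first block */
	note_write_record(&ss, 1, 4096, 4096);	/* object 1, next block */
	note_write_record(&ss, 2, 0, 4096);	/* higher object: ok */
	/* note_write_record(&ss, 1, 8192, 4096) would trip the assert */
	return (0);
}

Tracking just the last (object, offset) pair is sufficient precisely because the order is
total; no per-object bookkeeping is needed to establish the one-record constraint.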
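
In the same spirit, the dmu_tx_hold_free() change reads and charges a level-0 block only
when the free range covers it partially; a fully covered block is never instantiated as a
dbuf, which is exactly what the VERIFY3P checks in dbuf_free_range() rely on. Here is a
small sketch of the alignment rule, assuming a power-of-two block size; count_edge_blocks()
and its printf reporting are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define	IS_P2ALIGNED(v, a)	((((uint64_t)(v)) & ((uint64_t)(a) - 1)) == 0)

/* Report which level-0 blocks of a free range would have to be read. */
static void
count_edge_blocks(uint64_t off, uint64_t len, unsigned datablkshift)
{
	uint64_t blksz = 1ULL << datablkshift;

	if (!IS_P2ALIGNED(off, blksz))
		printf("read first block: off %llu is mid-block\n",
		    (unsigned long long)off);
	if (!IS_P2ALIGNED(off + len, blksz))
		printf("read last block: end %llu is mid-block\n",
		    (unsigned long long)(off + len));
	if (IS_P2ALIGNED(off, blksz) && IS_P2ALIGNED(off + len, blksz))
		printf("aligned free: no level-0 blocks read or counted\n");
}

int
main(void)
{
	count_edge_blocks(0, 1ULL << 17, 17);	/* exactly one 128K block */
	count_edge_blocks(512, 4096, 12);	/* ragged 4K edge blocks */
	return (0);
}

In the aligned case the transaction still counts the dnode and the level-1 indirect
blocks, but no level-0 dbuf is ever created, so the receive path never has to search
dn_dbufs for blocks that were only ever freed.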