Illumos 4370, 4371

4370 avoid transmitting holes during zfs send
4371 DMU code clean up

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>a

References:
  https://www.illumos.org/issues/4370
  https://www.illumos.org/issues/4371
  https://github.com/illumos/illumos-gate/commit/43466aa

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2529
This commit is contained in:
Max Grossman
2013-12-09 10:37:51 -08:00
committed by Brian Behlendorf
parent fa86b5dbb6
commit b0bc7a84d9
32 changed files with 690 additions and 385 deletions
+53 -82
View File
@@ -511,10 +511,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
mutex_enter(&db->db_mtx);
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
int blksz = db->db.db_size;
spa_t *spa;
spa_t *spa = db->db_objset->os_spa;
mutex_exit(&db->db_mtx);
DB_GET_SPA(&spa, db);
abuf = arc_loan_buf(spa, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
@@ -575,7 +574,6 @@ static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
dnode_t *dn;
spa_t *spa;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
@@ -615,9 +613,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
DB_DNODE_EXIT(db);
dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
@@ -625,7 +623,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
return;
}
spa = dn->dn_objset->os_spa;
DB_DNODE_EXIT(db);
db->db_state = DB_READ;
@@ -642,7 +639,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_add_ref(db, NULL);
(void) arc_read(zio, spa, db->db_blkptr,
(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -654,8 +651,8 @@ int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
int err = 0;
int havepzio = (zio != NULL);
int prefetch;
boolean_t havepzio = (zio != NULL);
boolean_t prefetch;
dnode_t *dn;
/*
@@ -750,11 +747,10 @@ dbuf_noread(dmu_buf_impl_t *db)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa;
spa_t *spa = db->db_objset->os_spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
DB_GET_SPA(&spa, db);
dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
@@ -809,9 +805,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa;
spa_t *spa = db->db_objset->os_spa;
DB_GET_SPA(&spa, db);
dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
@@ -837,12 +832,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
ASSERT(db->db_data_pending != dr);
/* free this block */
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
spa_t *spa;
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
DB_GET_SPA(&spa, db);
zio_free(spa, txg, bp);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
@@ -860,9 +852,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
/*
* Evict (if its unreferenced) or clear (if its referenced) any level-0
* data blocks in the free range, so that any future readers will find
* empty blocks. Also, if we happen across any level-1 dbufs in the
* range that have not already been marked dirty, mark them dirty so
* they stay in memory.
* empty blocks.
*
* This is a no-op if the dataset is in the middle of an incremental
* receive; see comment below for details.
@@ -872,14 +862,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
uint64_t first_l1 = start >> epbs;
uint64_t last_l1 = end >> epbs;
if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
end = dn->dn_maxblkid;
last_l1 = end >> epbs;
}
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
mutex_enter(&dn->dn_dbufs_mtx);
@@ -902,23 +887,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
if (db->db_level == 1 &&
db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
mutex_enter(&db->db_mtx);
if (db->db_last_dirty &&
db->db_last_dirty->dr_txg < txg) {
dbuf_add_ref(db, FTAG);
mutex_exit(&db->db_mtx);
dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
} else {
mutex_exit(&db->db_mtx);
}
}
if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
if (db->db_blkid < start || db->db_blkid > end)
continue;
@@ -995,24 +965,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*
* This logic ensures that only block births for
* filled blocks are considered.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_last_dirty)
if (db->db_last_dirty && (db->db_blkptr == NULL ||
!BP_IS_HOLE(db->db_blkptr))) {
birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
birth_txg = db->db_blkptr->blk_birth;
}
/*
* If we don't exist or are in a snapshot, we can't be freed.
* If this block don't exist or is in a snapshot, it can't be freed.
* Don't pass the bp to dsl_dataset_block_freeable() since we
* are holding the db_mtx lock and might deadlock if we are
* prefetching a dedup-ed block.
*/
if (birth_txg)
if (birth_txg != 0)
return (ds == NULL ||
dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
return (FALSE);
return (B_FALSE);
}
void
@@ -1032,7 +1007,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
* This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
* is OK, because there can be no other references to the db
* when we are changing its size, so no concurrent DB_FILL can
* be happening.
@@ -1041,7 +1016,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
* XXX we should be doing a dbuf_read, checking the return
* value and returning that up to our callers
*/
dbuf_will_dirty(db, tx);
dmu_buf_will_dirty(&db->db, tx);
/* create the data buffer for the new block */
buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
@@ -1071,9 +1046,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
objset_t *os;
ASSERTV(objset_t *os = db->db_objset);
DB_GET_OBJSET(&os, db);
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
ASSERT(arc_released(os->os_phys_buf) ||
list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -1448,10 +1422,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
}
#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
ASSERT(tx->tx_txg != 0);
@@ -1574,7 +1548,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
db->db_state = DB_FILL;
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
dbuf_fill_done(db, tx);
dmu_buf_fill_done(&db->db, tx);
}
/*
@@ -2132,7 +2106,6 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
* dnode's parent dbuf evicting its dnode handles.
*/
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
@@ -2140,6 +2113,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
dbuf_rele_and_unlock(db, tag);
}
void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
dbuf_rele((dmu_buf_impl_t *)db, tag);
}
/*
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
* db_dirtycnt and db_holds to be updated atomically.
@@ -2600,18 +2579,14 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
if (BP_IS_HOLE(bp)) {
ASSERT(bp->blk_fill == 0);
DB_DNODE_EXIT(db);
return;
if (bp->blk_birth != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_bonustype));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
}
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_bonustype));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
mutex_enter(&db->db_mtx);
#ifdef ZFS_DEBUG
@@ -2637,7 +2612,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill++;
}
} else {
fill = 1;
if (BP_IS_HOLE(bp)) {
fill = 0;
} else {
fill = 1;
}
}
} else {
blkptr_t *ibp = db->db.db_data;
@@ -2692,9 +2671,10 @@ static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t txg = zio->io_txg;
blkptr_t *bp = db->db_blkptr;
objset_t *os = db->db_objset;
dmu_tx_t *tx = os->os_synctx;
dbuf_dirty_record_t **drp, *dr;
ASSERT0(zio->io_error);
@@ -2707,14 +2687,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
objset_t *os;
dsl_dataset_t *ds;
dmu_tx_t *tx;
DB_GET_OBJSET(&os, db);
ds = os->os_dsl_dataset;
tx = os->os_synctx;
dsl_dataset_t *ds = os->os_dsl_dataset;
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
}
@@ -2727,7 +2700,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
while ((dr = *drp) != db->db_data_pending)
drp = &dr->dr_next;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_txg == txg);
ASSERT(dr->dr_dbuf == db);
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
@@ -2761,14 +2733,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
SPA_BLKPTRSHIFT);
ASSERT3U(db->db_blkid, <=,
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
ASSERT3U(dn->dn_phys->dn_maxblkid
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
DB_DNODE_EXIT(db);
@@ -2781,8 +2753,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
static void