Illumos 4370, 4371

4370 avoid transmitting holes during zfs send
4371 DMU code clean up

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>a

References:
  https://www.illumos.org/issues/4370
  https://www.illumos.org/issues/4371
  https://github.com/illumos/illumos-gate/commit/43466aa

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2529
This commit is contained in:
Max Grossman
2013-12-09 10:37:51 -08:00
committed by Brian Behlendorf
parent fa86b5dbb6
commit b0bc7a84d9
32 changed files with 690 additions and 385 deletions
+102 -88
View File
@@ -32,6 +32,7 @@
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/zfeature.h>
static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
@@ -112,26 +113,48 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
rw_exit(&dn->dn_struct_rwlock);
}
static int
static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint64_t bytesfreed = 0;
int i, blocks_freed = 0;
int i;
dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
for (i = 0; i < num; i++, bp++) {
uint64_t lsize, lvl;
dmu_object_type_t type;
if (BP_IS_HOLE(bp))
continue;
bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
/*
* Save some useful information on the holes being
* punched, including logical size, type, and indirection
* level. Retaining birth time enables detection of when
* holes are punched for reducing the number of free
* records transmitted during a zfs send.
*/
lsize = BP_GET_LSIZE(bp);
type = BP_GET_TYPE(bp);
lvl = BP_GET_LEVEL(bp);
bzero(bp, sizeof (blkptr_t));
blocks_freed += 1;
if (spa_feature_is_active(dn->dn_objset->os_spa,
SPA_FEATURE_HOLE_BIRTH)) {
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, type);
BP_SET_LEVEL(bp, lvl);
BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
}
}
dnode_diduse_space(dn, -bytesfreed);
return (blocks_freed);
}
#ifdef ZFS_DEBUG
@@ -215,30 +238,27 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
#define ALL -1
static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
{
dnode_t *dn;
blkptr_t *bp;
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
int epbs, shift, err;
int all = TRUE;
int blocks_freed = 0;
int epbs, shift;
/*
* There is a small possibility that this block will not be cached:
* 1 - if level > 1 and there are no children with level <= 1
* 2 - if we didn't get a dirty hold (because this block had just
* finished being written -- and so had no holds), and then this
* block got evicted before we got here.
* 2 - if this block was evicted since we read it from
* dmu_tx_hold_free().
*/
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
dbuf_release_bp(db);
bp = (blkptr_t *)db->db.db_data;
bp = db->db.db_data;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -248,7 +268,6 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
start = blkid >> shift;
if (dbstart < start) {
bp += start - dbstart;
all = FALSE;
} else {
start = dbstart;
}
@@ -256,49 +275,46 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
end = (blkid + nblks - 1) >> shift;
if (dbend <= end)
end = dbend;
else if (all)
all = trunc;
ASSERT3U(start, <=, end);
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
DB_DNODE_EXIT(db);
return (all ? ALL : blocks_freed);
}
free_blocks(dn, bp, end-start+1, tx);
} else {
for (i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
i, B_TRUE, FTAG, &subdb));
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);
for (i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
ASSERT0(err);
rw_exit(&dn->dn_struct_rwlock);
if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(subdb->db_blkptr, ==, bp);
blocks_freed += free_blocks(dn, bp, 1, tx);
} else {
all = FALSE;
free_children(subdb, blkid, nblks, tx);
dbuf_rele(subdb, FTAG);
}
dbuf_rele(subdb, FTAG);
}
/* If this whole block is free, free ourself too. */
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
if (!BP_IS_HOLE(bp))
break;
}
if (i == 1 << epbs) {
/* didn't find any non-holes */
bzero(db->db.db_data, db->db.db_size);
free_blocks(dn, db->db_blkptr, 1, tx);
} else {
/*
* Partial block free; must be marked dirty so that it
* will be written out.
*/
ASSERT(db->db_dirtycnt > 0);
}
DB_DNODE_EXIT(db);
arc_buf_freeze(db->db_buf);
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
for (i = start; i <= end; i++, bp++) {
if (i == start && blkid != 0)
continue;
else if (i == end && !trunc)
continue;
ASSERT0(bp->blk_birth);
}
#endif
ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
return (all ? ALL : blocks_freed);
}
/*
@@ -306,20 +322,21 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
* and "free" all the blocks contained there.
*/
static void
dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
dmu_buf_impl_t *db;
int trunc, start, end, shift, i, err;
int dnlevel = dn->dn_phys->dn_nlevels;
boolean_t trunc = B_FALSE;
if (blkid > dn->dn_phys->dn_maxblkid)
return;
ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
if (trunc)
if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
trunc = B_TRUE;
}
/* There are no indirect blocks in the object */
if (dnlevel == 1) {
@@ -328,41 +345,36 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
return;
}
ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
(void) free_blocks(dn, bp + blkid, nblks, tx);
if (trunc) {
ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec<<SPA_MINBLOCKSHIFT));
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
free_blocks(dn, bp + blkid, nblks, tx);
} else {
int shift = (dnlevel - 1) *
(dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
int start = blkid >> shift;
int end = (blkid + nblks - 1) >> shift;
dmu_buf_impl_t *db;
int i;
ASSERT(start < dn->dn_phys->dn_nblkptr);
bp += start;
for (i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
TRUE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
free_children(db, blkid, nblks, tx);
dbuf_rele(db, FTAG);
}
return;
}
shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
start = blkid >> shift;
ASSERT(start < dn->dn_phys->dn_nblkptr);
end = (blkid + nblks - 1) >> shift;
bp += start;
for (i = start; i <= end; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
ASSERT0(err);
rw_exit(&dn->dn_struct_rwlock);
if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(db->db_blkptr, ==, bp);
(void) free_blocks(dn, bp, 1, tx);
}
dbuf_rele(db, FTAG);
}
if (trunc) {
ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
ASSERTV(uint64_t off);
dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
ASSERTV(off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT));
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
@@ -504,7 +516,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_free_txg > 0);
if (dn->dn_allocated_txg != dn->dn_free_txg)
dbuf_will_dirty(dn->dn_dbuf, tx);
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t));
mutex_enter(&dn->dn_mtx);
@@ -535,6 +547,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
boolean_t kill_spill = B_FALSE;
boolean_t freeing_dnode;
ASSERTV(static const dnode_phys_t zerodn = { 0 });
ASSERT(dmu_tx_is_syncing(tx));
@@ -611,13 +624,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dn->dn_next_bonustype[txgoff] = 0;
}
freeing_dnode = dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg;
/*
* We will either remove a spill block when a file is being removed
* or we have been asked to remove it.
*/
if (dn->dn_rm_spillblk[txgoff] ||
((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && freeing_dnode)) {
if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
kill_spill = B_TRUE;
dn->dn_rm_spillblk[txgoff] = 0;
@@ -640,7 +654,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
if (kill_spill) {
(void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
mutex_enter(&dn->dn_mtx);
dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
mutex_exit(&dn->dn_mtx);
@@ -656,7 +670,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
kmem_free(rp, sizeof (free_range_t));
}
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
if (freeing_dnode) {
dnode_sync_free(dn, tx);
return;
}