diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index b9960782e..afee869b6 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -311,6 +311,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + /* + * If we don't create this free range, we'll leak indirect blocks when + * we get to freeing the dnode in syncing context. + */ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 7939a7ba6..0be72e90a 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1889,6 +1889,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) } } +/* + * Dirty all the in-core level-1 dbufs in the range specified by start_blkid + * and end_blkid. + */ +static void +dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db; + avl_index_t where; + + mutex_enter(&dn->dn_dbufs_mtx); + + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + for (;;) { + + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + if (db == NULL || db->db_level != 1 || + db->db_blkid >= end_blkid) { + break; + } + + /* + * Setup the next blkid we want to search for. + */ + db_search.db_blkid = db->db_blkid + 1; + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* + * If the dbuf transitions to DB_EVICTING while we're trying + * to dirty it, then we will be unable to discover it in + * the dbuf hash table. This will result in a call to + * dbuf_create() which needs to acquire the dn_dbufs_mtx + * lock. To avoid a deadlock, we drop the lock before + * dirtying the level-1 dbuf. + */ + mutex_exit(&dn->dn_dbufs_mtx); + dnode_dirty_l1(dn, db->db_blkid, tx); + mutex_enter(&dn->dn_dbufs_mtx); + } + +#ifdef ZFS_DEBUG + /* + * Walk all the in-core level-1 dbufs and verify they have been dirtied. + */ + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_level != 1 || db->db_blkid >= end_blkid) + break; + ASSERT(db->db_dirtycnt > 0); + } +#endif + mutex_exit(&dn->dn_dbufs_mtx); +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { @@ -2040,6 +2106,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (last != first) dnode_dirty_l1(dn, last, tx); + dnode_dirty_l1range(dn, first, last, tx); + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 22b401ab5..3202faf49 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -230,9 +230,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } #endif +/* + * We don't usually free the indirect blocks here. If in one txg we have a + * free_range and a write to the same indirect block, it's important that we + * preserve the hole's birth times. Therefore, we don't free any any indirect + * blocks in free_children(). If an indirect block happens to turn into all + * holes, it will be freed by dbuf_write_children_ready, which happens at a + * point in the syncing process where we know for certain the contents of the + * indirect block. + * + * However, if we're freeing a dnode, its space accounting must go to zero + * before we actually try to free the dnode, or we will trip an assertion. In + * addition, we know the case described above cannot occur, because the dnode is + * being freed. Therefore, we free the indirect blocks immediately in that + * case. + */ static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; @@ -284,32 +299,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); - free_children(subdb, blkid, nblks, tx); + free_children(subdb, blkid, nblks, free_indirects, tx); dbuf_rele(subdb, FTAG); } } - /* If this whole block is free, free ourself too. */ - for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { - if (!BP_IS_HOLE(bp)) - break; - } - if (i == 1 << epbs) { - /* - * We only found holes. Grab the rwlock to prevent - * anybody from reading the blocks we're about to - * zero out. - */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (free_indirects) { + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) + ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); free_blocks(dn, db->db_blkptr, 1, tx); - } else { - /* - * Partial block free; must be marked dirty so that it - * will be written out. - */ - ASSERT(db->db_dirtycnt > 0); } DB_DNODE_EXIT(db); @@ -322,7 +321,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; @@ -362,7 +361,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, tx); + free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } @@ -387,6 +386,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; + boolean_t dsfra_free_indirects; } dnode_sync_free_range_arg_t; static void @@ -396,7 +396,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx); + dnode_sync_free_range_impl(dn, blkid, nblks, + dsfra->dsfra_free_indirects, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } @@ -712,6 +713,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; + dsfra.dsfra_free_indirects = freeing_dnode; + if (freeing_dnode) { + ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], + 0, dn->dn_maxblkid + 1)); + } mutex_enter(&dn->dn_mtx); range_tree_vacate(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 89563189f..c3e3887a8 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -743,7 +743,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_heirarchy', - 'send_freeobjects', 'send_realloc_dnode_size'] + 'send_freeobjects', 'send_realloc_dnode_size', 'send_hole_birth'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am index fd00d6c49..79e01d190 100644 --- a/tests/zfs-tests/tests/functional/rsend/Makefile.am +++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am @@ -39,7 +39,8 @@ dist_pkgdata_SCRIPTS = \ send-c_zstreamdump.ksh \ send-cpL_varied_recsize.ksh \ send_freeobjects.ksh \ - send_realloc_dnode_size.ksh + send_realloc_dnode_size.ksh \ + send_hole_birth.ksh dist_pkgdata_DATA = \ rsend.cfg \ diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib index 909cd1677..72d2eb93d 100644 --- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib +++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib @@ -153,6 +153,20 @@ function cleanup_pools destroy_pool $POOL3 } +function cmp_md5s { + typeset file1=$1 + typeset file2=$2 + + eval md5sum $file1 | awk '{ print $1 }' > $BACKDIR/md5_file1 + eval md5sum $file2 | awk '{ print $1 }' > $BACKDIR/md5_file2 + diff $BACKDIR/md5_file1 $BACKDIR/md5_file2 + typeset -i ret=$? + + rm -f $BACKDIR/md5_file1 $BACKDIR/md5_file2 + + return $ret +} + # # Detect if the given two filesystems have same sub-datasets # @@ -388,13 +402,36 @@ function mk_files maxsize=$2 file_id_offset=$3 fs=$4 + bs=512 for ((i=0; i<$nfiles; i=i+1)); do - dd if=/dev/urandom \ - of=/$fs/file-$maxsize-$((i+$file_id_offset)) \ - bs=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) \ - count=1 >/dev/null 2>&1 || log_fail \ - "Failed to create /$fs/file-$maxsize-$((i+$file_id_offset))" + file_name="/$fs/file-$maxsize-$((i+$file_id_offset))" + file_size=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) + + # + # Create an interesting mix of files which contain both + # data blocks and holes for more realistic test coverage. + # Half the files are created as sparse then partially filled, + # the other half is dense then a hole is punched in the file. + # + if [ $((RANDOM % 2)) -eq 0 ]; then + truncate -s $file_size $file_name || \ + log_fail "Failed to create $file_name" + dd if=/dev/urandom of=$file_name \ + bs=$bs count=$(($file_size / 2 / $bs)) \ + seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \ + conv=notrunc >/dev/null 2>&1 || \ + log_fail "Failed to create $file_name" + else + dd if=/dev/urandom of=$file_name \ + bs=$file_size count=1 >/dev/null 2>&1 || \ + log_fail "Failed to create $file_name" + dd if=/dev/zero of=$file_name \ + bs=$bs count=$(($file_size / 2 / $bs)) \ + seek=$(($RANDOM % (($file_size / 2 / $bs) + 1))) \ + conv=notrunc >/dev/null 2>&1 || \ + log_fail "Failed to create $file_name" + fi done echo Created $nfiles files of random sizes up to $maxsize bytes } diff --git a/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh new file mode 100755 index 000000000..c2b5ff7a0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh @@ -0,0 +1,123 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# DESCRIPTION: +# Verify send streams which contain holes. +# +# STRATEGY: +# 1. Create an initial file for the full send and snapshot. +# 2. Permute the file with holes and snapshot. +# 3. Send the full and incremental snapshots to a new pool. +# 4. Verify the contents of the files match. +# + +sendpool=$POOL +sendfs=$sendpool/sendfs + +recvpool=$POOL2 +recvfs=$recvpool/recvfs + +verify_runnable "both" + +log_assert "Test hole_birth" +log_onexit cleanup + +function cleanup +{ + cleanup_pool $sendpool + cleanup_pool $recvpool + set_tunable64 send_holes_without_birth_time 1 +} + +function send_and_verify +{ + log_must eval "zfs send $sendfs@snap1 > $BACKDIR/pool-snap1" + log_must eval "zfs receive -F $recvfs < $BACKDIR/pool-snap1" + + log_must eval "zfs send -i $sendfs@snap1 $sendfs@snap2 " \ + ">$BACKDIR/pool-snap1-snap2" + log_must eval "zfs receive $recvfs < $BACKDIR/pool-snap1-snap2" + + log_must cmp_md5s /$sendfs/file1 /$recvfs/file1 +} + +# By default sending hole_birth times is disabled. This functionality needs +# to be re-enabled for this test case to verify correctness. Once we're +# comfortable that all hole_birth bugs has been resolved this behavior may +# be re-enabled by default. +log_must set_tunable64 send_holes_without_birth_time 0 + +# Incremental send truncating the file and adding new data. +log_must zfs create -o recordsize=4k $sendfs + +log_must truncate -s 1G /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=11264 seek=1152 +log_must zfs snapshot $sendfs@snap1 + +log_must truncate -s 4194304 /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=152 seek=384 \ + conv=notrunc +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=4k count=10 seek=1408 \ + conv=notrunc +log_must zfs snapshot $sendfs@snap2 + +send_and_verify +log_must cleanup_pool $sendpool +log_must cleanup_pool $recvpool + +# Incremental send appending a hole and data. +log_must zfs create -o recordsize=512 $sendfs + +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 seek=1 +log_must zfs snapshot $sendfs@snap1 + +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=128k count=1 seek=3 +log_must zfs snapshot $sendfs@snap2 + +send_and_verify +log_must cleanup_pool $sendpool +log_must cleanup_pool $recvpool + +# Incremental send truncating the file and adding new data. +log_must zfs create -o recordsize=512 $sendfs + +log_must truncate -s 300M /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=512 count=128k conv=notrunc +log_must zfs snapshot $sendfs@snap1 + +log_must truncate -s 10M /$sendfs/file1 +log_must dd if=/dev/urandom of=/$sendfs/file1 bs=512 count=1 seek=96k \ + conv=notrunc +log_must zfs snapshot $sendfs@snap2 + +send_and_verify + +log_pass "Test hole_birth"