4047 panic from dbuf_free_range() from dmu_free_object() while
     doing zfs receive
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@nexenta.com>

References:
  https://www.illumos.org/issues/4047
  illumos/illumos-gate@713d6c2088

Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1775

Porting notes:

1. The exported symbol dmu_free_object() was renamed to
   dmu_free_long_object() in Illumos.
This commit is contained in:
Matthew Ahrens 2013-08-20 20:11:52 -08:00 committed by Brian Behlendorf
parent 46ba1e59d3
commit b663a23d36
8 changed files with 99 additions and 89 deletions

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/ */
@ -579,7 +579,7 @@ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx); uint64_t size, dmu_tx_t *tx);
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size); uint64_t size);
int dmu_free_object(objset_t *os, uint64_t object); int dmu_free_long_object(objset_t *os, uint64_t object);
/* /*
* Convenience functions. * Convenience functions.

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_DNODE_H #ifndef _SYS_DNODE_H
@ -188,6 +188,8 @@ typedef struct dnode {
/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
uint32_t dn_dbufs_count; /* count of dn_dbufs */ uint32_t dn_dbufs_count; /* count of dn_dbufs */
/* There are no level-0 blocks of this blkid or higher in dn_dbufs */
uint64_t dn_unlisted_l0_blkid;
/* protected by os_lock: */ /* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */

View File

@ -64,6 +64,12 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
void *tag, dmu_buf_impl_t **dbp, int depth); void *tag, dmu_buf_impl_t **dbp, int depth);
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
/*
* Number of times that dbuf_free_range() took the slow path while doing
* a zfs receive. A nonzero value indicates a potential performance problem.
*/
uint64_t zfs_free_range_recv_miss;
static void dbuf_destroy(dmu_buf_impl_t *db); static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@ -869,20 +875,22 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
} }
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
if (dmu_objset_is_receiving(dn->dn_objset)) { mutex_enter(&dn->dn_dbufs_mtx);
/* if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
* When processing a free record from a zfs receive, /* There can't be any dbufs in this range; no need to search. */
* there should have been no previous modifications to the mutex_exit(&dn->dn_dbufs_mtx);
* data in this range. Therefore there should be no dbufs
* in the range. Searching dn_dbufs for these non-existent
* dbufs can be very expensive, so simply ignore this.
*/
VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
return; return;
} else if (dmu_objset_is_receiving(dn->dn_objset)) {
/*
* If we are receiving, we expect there to be no dbufs in
* the range to be freed, because receive modifies each
* block at most once, and in offset order. If this is
* not the case, it can lead to performance problems,
* so note that we unexpectedly took the slow path.
*/
atomic_inc_64(&zfs_free_range_recv_miss);
} }
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) { for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db); db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@ -1781,6 +1789,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
return (odb); return (odb);
} }
list_insert_head(&dn->dn_dbufs, db); list_insert_head(&dn->dn_dbufs, db);
if (db->db_level == 0 && db->db_blkid >=
dn->dn_unlisted_l0_blkid)
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx); mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

View File

@ -568,98 +568,95 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
* the end so that the file gets shorter over time (if we crash in the * the end so that the file gets shorter over time (if we crash in the
* middle, this will leave us in a better state). We find allocated file * middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects. * data by simply searching the allocated level 1 indirects.
*
* On input, *start should be the first offset that does not need to be
* freed (e.g. "offset + length"). On return, *start will be the first
* offset that should be freed.
*/ */
static int static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{ {
uint64_t len = *start - limit; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
uint64_t blkcnt = 0; /* bytes of data covered by a level-1 indirect block */
uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
uint64_t iblkrange = uint64_t iblkrange =
dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
uint64_t blks;
ASSERT(limit <= *start); ASSERT3U(minimum, <=, *start);
if (len <= iblkrange * maxblks) { if (*start - minimum <= iblkrange * maxblks) {
*start = limit; *start = minimum;
return (0); return (0);
} }
ASSERT(ISP2(iblkrange)); ASSERT(ISP2(iblkrange));
while (*start > limit && blkcnt < maxblks) { for (blks = 0; *start > minimum && blks < maxblks; blks++) {
int err; int err;
/* find next allocated L1 indirect */ /*
* dnode_next_offset(BACKWARDS) will find an allocated L1
* indirect block at or before the input offset. We must
* decrement *start so that it is at the end of the region
* to search.
*/
(*start)--;
err = dnode_next_offset(dn, err = dnode_next_offset(dn,
DNODE_FIND_BACKWARDS, start, 2, 1, 0); DNODE_FIND_BACKWARDS, start, 2, 1, 0);
/* if there are no more, then we are done */ /* if there are no indirect blocks before start, we are done */
if (err == ESRCH) { if (err == ESRCH) {
*start = limit; *start = minimum;
return (0); break;
} else if (err) { } else if (err != 0) {
return (err); return (err);
} }
blkcnt += 1;
/* reset offset to end of "next" block back */ /* set start to the beginning of this L1 indirect */
*start = P2ALIGN(*start, iblkrange); *start = P2ALIGN(*start, iblkrange);
if (*start <= limit)
*start = limit;
else
*start -= 1;
} }
if (*start < minimum)
*start = minimum;
return (0); return (0);
} }
static int static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t free_dnode) uint64_t length)
{ {
dmu_tx_t *tx; uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
uint64_t object_size, start, end, len; int err;
boolean_t trunc = (length == DMU_OBJECT_END);
int align, err;
align = 1 << dn->dn_datablkshift; if (offset >= object_size)
ASSERT(align > 0);
object_size = align == 1 ? dn->dn_datablksz :
(dn->dn_maxblkid + 1) << dn->dn_datablkshift;
end = offset + length;
if (trunc || end > object_size)
end = object_size;
if (end <= offset)
return (0); return (0);
length = end - offset;
while (length) { if (length == DMU_OBJECT_END || offset + length > object_size)
start = end; length = object_size - offset;
/* assert(offset <= start) */
err = get_next_chunk(dn, &start, offset); while (length != 0) {
uint64_t chunk_end, chunk_begin;
dmu_tx_t *tx;
chunk_end = chunk_begin = offset + length;
/* move chunk_begin backwards to the beginning of this chunk */
err = get_next_chunk(dn, &chunk_begin, offset);
if (err) if (err)
return (err); return (err);
len = trunc ? DMU_OBJECT_END : end - start; ASSERT3U(chunk_begin, >=, offset);
ASSERT3U(chunk_begin, <=, chunk_end);
tx = dmu_tx_create(os); tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, dn->dn_object, start, len); dmu_tx_hold_free(tx, dn->dn_object,
chunk_begin, chunk_end - chunk_begin);
err = dmu_tx_assign(tx, TXG_WAIT); err = dmu_tx_assign(tx, TXG_WAIT);
if (err) { if (err) {
dmu_tx_abort(tx); dmu_tx_abort(tx);
return (err); return (err);
} }
dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
dnode_free_range(dn, start, trunc ? -1 : len, tx);
if (start == 0 && free_dnode) {
ASSERT(trunc);
dnode_free(dn, tx);
}
length -= end - start;
dmu_tx_commit(tx); dmu_tx_commit(tx);
end = start;
length -= chunk_end - chunk_begin;
} }
return (0); return (0);
} }
@ -674,38 +671,32 @@ dmu_free_long_range(objset_t *os, uint64_t object,
err = dnode_hold(os, object, FTAG, &dn); err = dnode_hold(os, object, FTAG, &dn);
if (err != 0) if (err != 0)
return (err); return (err);
err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); err = dmu_free_long_range_impl(os, dn, offset, length);
dnode_rele(dn, FTAG); dnode_rele(dn, FTAG);
return (err); return (err);
} }
int int
dmu_free_object(objset_t *os, uint64_t object) dmu_free_long_object(objset_t *os, uint64_t object)
{ {
dnode_t *dn;
dmu_tx_t *tx; dmu_tx_t *tx;
int err; int err;
err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
FTAG, &dn);
if (err != 0) if (err != 0)
return (err); return (err);
if (dn->dn_nlevels == 1) {
tx = dmu_tx_create(os); tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, object); dmu_tx_hold_bonus(tx, object);
dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
err = dmu_tx_assign(tx, TXG_WAIT); err = dmu_tx_assign(tx, TXG_WAIT);
if (err == 0) { if (err == 0) {
dnode_free_range(dn, 0, DMU_OBJECT_END, tx); err = dmu_object_free(os, object, tx);
dnode_free(dn, tx);
dmu_tx_commit(tx); dmu_tx_commit(tx);
} else { } else {
dmu_tx_abort(tx); dmu_tx_abort(tx);
} }
} else {
err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
}
dnode_rele(dn, FTAG);
return (err); return (err);
} }
@ -2042,7 +2033,7 @@ EXPORT_SYMBOL(dmu_buf_rele_array);
EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_prefetch);
EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_range);
EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_range);
EXPORT_SYMBOL(dmu_free_object); EXPORT_SYMBOL(dmu_free_long_object);
EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read);
EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write);
EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_prealloc);

View File

@ -1262,7 +1262,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
if (dmu_object_info(os, obj, NULL) != 0) if (dmu_object_info(os, obj, NULL) != 0)
continue; continue;
err = dmu_free_object(os, obj); err = dmu_free_long_object(os, obj);
if (err != 0) if (err != 0)
return (err); return (err);
} }

View File

@ -632,6 +632,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
* if they are blocksize-aligned. * if they are blocksize-aligned.
*/ */
if (dn->dn_datablkshift == 0) { if (dn->dn_datablkshift == 0) {
if (off != 0 || len < dn->dn_datablksz)
dmu_tx_count_write(txh, off, len); dmu_tx_count_write(txh, off, len);
} else { } else {
/* first block will be modified if it is not aligned */ /* first block will be modified if it is not aligned */

View File

@ -117,6 +117,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
dn->dn_id_flags = 0; dn->dn_id_flags = 0;
dn->dn_dbufs_count = 0; dn->dn_dbufs_count = 0;
dn->dn_unlisted_l0_blkid = 0;
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link)); offsetof(dmu_buf_impl_t, db_link));
@ -169,6 +170,7 @@ dnode_dest(void *arg, void *unused)
ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_id_flags);
ASSERT0(dn->dn_dbufs_count); ASSERT0(dn->dn_dbufs_count);
ASSERT0(dn->dn_unlisted_l0_blkid);
list_destroy(&dn->dn_dbufs); list_destroy(&dn->dn_dbufs);
} }
@ -472,6 +474,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_newuid = 0; dn->dn_newuid = 0;
dn->dn_newgid = 0; dn->dn_newgid = 0;
dn->dn_id_flags = 0; dn->dn_id_flags = 0;
dn->dn_unlisted_l0_blkid = 0;
dmu_zfetch_rele(&dn->dn_zfetch); dmu_zfetch_rele(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn); kmem_cache_free(dnode_cache, dn);
@ -703,6 +706,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ASSERT(list_is_empty(&ndn->dn_dbufs)); ASSERT(list_is_empty(&ndn->dn_dbufs));
list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_dbufs_count = odn->dn_dbufs_count;
ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
ndn->dn_bonus = odn->dn_bonus; ndn->dn_bonus = odn->dn_bonus;
ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_have_spill = odn->dn_have_spill;
ndn->dn_zio = odn->dn_zio; ndn->dn_zio = odn->dn_zio;
@ -737,6 +741,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link)); offsetof(dmu_buf_impl_t, db_link));
odn->dn_dbufs_count = 0; odn->dn_dbufs_count = 0;
odn->dn_unlisted_l0_blkid = 0;
odn->dn_bonus = NULL; odn->dn_bonus = NULL;
odn->dn_zfetch.zf_dnode = NULL; odn->dn_zfetch.zf_dnode = NULL;
@ -1524,7 +1529,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
blkshift = dn->dn_datablkshift; blkshift = dn->dn_datablkshift;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
if (len == -1ULL) { if (len == DMU_OBJECT_END) {
len = UINT64_MAX - off; len = UINT64_MAX - off;
trunc = TRUE; trunc = TRUE;
} }

View File

@ -905,7 +905,7 @@ dsl_destroy_head(const char *name)
for (obj = 0; error == 0; for (obj = 0; error == 0;
error = dmu_object_next(os, &obj, FALSE, error = dmu_object_next(os, &obj, FALSE,
prev_snap_txg)) prev_snap_txg))
(void) dmu_free_object(os, obj); (void) dmu_free_long_object(os, obj);
/* sync out all frees */ /* sync out all frees */
txg_wait_synced(dmu_objset_pool(os), 0); txg_wait_synced(dmu_objset_pool(os), 0);
dmu_objset_disown(os, FTAG); dmu_objset_disown(os, FTAG);