mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 19:04:45 +03:00
Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol
4390 i/o errors when deleting filesystem/zvol can lead to space map corruption

Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4390
  https://github.com/illumos/illumos-gate/commit/7fd05ac

Porting notes:

Previous stack-reduction efforts in traverse_visitb() left a fair number of pieces of code that could not be merged cleanly. This patch should reduce its stack footprint a bit more.

The new local bptree_entry_phys_t in bptree_add() is dynamically allocated using kmem_zalloc() for the purpose of stack reduction.

The new global zfs_free_leak_on_eio has been defined as an integer rather than a boolean_t, as was the case with the related zfs_recover global. Also, zfs_free_leak_on_eio's definition has been inserted into zfs_debug.c for consistency with the existing definition of zfs_recover. Illumos placed it in spa_misc.c.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2545
This commit is contained in:
committed by
Brian Behlendorf
parent
9b67f60560
commit
fbeddd60b7
+87
-22
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
|
||||
return (dmu_object_free(os, obj, tx));
|
||||
}
|
||||
|
||||
boolean_t
|
||||
bptree_is_empty(objset_t *os, uint64_t obj)
|
||||
{
|
||||
dmu_buf_t *db;
|
||||
bptree_phys_t *bt;
|
||||
boolean_t rv;
|
||||
|
||||
VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
|
||||
bt = db->db_data;
|
||||
rv = (bt->bt_begin == bt->bt_end);
|
||||
dmu_buf_rele(db, FTAG);
|
||||
return (rv);
|
||||
}
|
||||
|
||||
void
|
||||
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
|
||||
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_t *db;
|
||||
bptree_phys_t *bt;
|
||||
bptree_entry_phys_t bte;
|
||||
bptree_entry_phys_t *bte;
|
||||
|
||||
/*
|
||||
* bptree objects are in the pool mos, therefore they can only be
|
||||
@@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
|
||||
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
|
||||
bt = db->db_data;
|
||||
|
||||
bte.be_birth_txg = birth_txg;
|
||||
bte.be_bp = *bp;
|
||||
bzero(&bte.be_zb, sizeof (bte.be_zb));
|
||||
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
|
||||
bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
|
||||
bte->be_birth_txg = birth_txg;
|
||||
bte->be_bp = *bp;
|
||||
dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
|
||||
kmem_free(bte, sizeof (*bte));
|
||||
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
bt->bt_end++;
|
||||
@@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* If "free" is set:
|
||||
* - It is assumed that "func" will be freeing the block pointers.
|
||||
* - If "func" returns nonzero, the bookmark will be remembered and
|
||||
* iteration will be restarted from this point on next invocation.
|
||||
* - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
|
||||
* bptree_iterate will remember the bookmark, continue traversing
|
||||
* any additional entries, and return 0.
|
||||
*
|
||||
* If "free" is not set, traversal will stop and return an error if
|
||||
* an i/o error is encountered.
|
||||
*
|
||||
* In either case, if zfs_free_leak_on_eio is set, i/o errors will be
|
||||
* ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
|
||||
* traverse_dataset_destroyed()).
|
||||
*/
|
||||
int
|
||||
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
|
||||
void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
boolean_t ioerr = B_FALSE;
|
||||
int err;
|
||||
uint64_t i;
|
||||
dmu_buf_t *db;
|
||||
@@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
|
||||
bptree_entry_phys_t bte;
|
||||
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
|
||||
|
||||
ASSERT(!free || i == ba.ba_phys->bt_begin);
|
||||
|
||||
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
|
||||
&bte, DMU_READ_NO_PREFETCH);
|
||||
if (err != 0)
|
||||
break;
|
||||
|
||||
if (zfs_recover)
|
||||
if (zfs_free_leak_on_eio)
|
||||
flags |= TRAVERSE_HARD;
|
||||
zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
|
||||
"bookmark %lld/%lld/%lld/%lld",
|
||||
i, (longlong_t)bte.be_birth_txg,
|
||||
(longlong_t)bte.be_zb.zb_objset,
|
||||
(longlong_t)bte.be_zb.zb_object,
|
||||
(longlong_t)bte.be_zb.zb_level,
|
||||
(longlong_t)bte.be_zb.zb_blkid);
|
||||
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
|
||||
bte.be_birth_txg, &bte.be_zb, flags,
|
||||
bptree_visit_cb, &ba);
|
||||
if (free) {
|
||||
if (err == ERESTART) {
|
||||
/*
|
||||
* The callback has freed the visited block pointers.
|
||||
* Record our traversal progress on disk, either by
|
||||
* updating this record's bookmark, or by logically
|
||||
* removing this record by advancing bt_begin.
|
||||
*/
|
||||
if (err != 0) {
|
||||
/* save bookmark for future resume */
|
||||
ASSERT3U(bte.be_zb.zb_objset, ==,
|
||||
ZB_DESTROYED_OBJSET);
|
||||
ASSERT0(bte.be_zb.zb_level);
|
||||
dmu_write(os, obj, i * sizeof (bte),
|
||||
sizeof (bte), &bte, tx);
|
||||
break;
|
||||
}
|
||||
if (err != 0) {
|
||||
if (err == EIO || err == ECKSUM ||
|
||||
err == ENXIO) {
|
||||
/*
|
||||
* Skip the rest of this tree and
|
||||
* continue on to the next entry.
|
||||
*/
|
||||
err = 0;
|
||||
ioerr = B_TRUE;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else if (ioerr) {
|
||||
/*
|
||||
* We can not properly handle an i/o
|
||||
* error, because the traversal code
|
||||
* does not know how to resume from an
|
||||
* arbitrary bookmark.
|
||||
* This entry is finished, but there were
|
||||
* i/o errors on previous entries, so we
|
||||
* can't adjust bt_begin. Set this entry's
|
||||
* be_birth_txg such that it will be
|
||||
* treated as a no-op in future traversals.
|
||||
*/
|
||||
zfs_panic_recover("error %u from "
|
||||
"traverse_dataset_destroyed()", err);
|
||||
bte.be_birth_txg = UINT64_MAX;
|
||||
dmu_write(os, obj, i * sizeof (bte),
|
||||
sizeof (bte), &bte, tx);
|
||||
}
|
||||
|
||||
ba.ba_phys->bt_begin++;
|
||||
(void) dmu_free_range(os, obj,
|
||||
i * sizeof (bte), sizeof (bte), tx);
|
||||
if (!ioerr) {
|
||||
ba.ba_phys->bt_begin++;
|
||||
(void) dmu_free_range(os, obj,
|
||||
i * sizeof (bte), sizeof (bte), tx);
|
||||
}
|
||||
} else if (err != 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
|
||||
ASSERT(!free || err != 0 || ioerr ||
|
||||
ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
|
||||
|
||||
/* if all blocks are free there should be no used space */
|
||||
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
|
||||
if (zfs_free_leak_on_eio) {
|
||||
ba.ba_phys->bt_bytes = 0;
|
||||
ba.ba_phys->bt_comp = 0;
|
||||
ba.ba_phys->bt_uncomp = 0;
|
||||
}
|
||||
|
||||
ASSERT0(ba.ba_phys->bt_bytes);
|
||||
ASSERT0(ba.ba_phys->bt_comp);
|
||||
ASSERT0(ba.ba_phys->bt_uncomp);
|
||||
|
||||
Reference in New Issue
Block a user