Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol

4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4390
  https://github.com/illumos/illumos-gate/commit/7fd05ac

Porting notes:

Previous stack-reduction efforts in traverse_visitbp() caused a fair
number of unmergeable pieces of code.  This patch should reduce its
stack footprint a bit more.

The new local bptree_entry_phys_t in bptree_add() is dynamically allocated
using kmem_zalloc() for the purpose of stack reduction.

Like the related zfs_recover global, the new global zfs_free_leak_on_eio
has been defined as an integer rather than a boolean_t.  Also,
zfs_free_leak_on_eio's definition has been inserted into zfs_debug.c for
consistency with the existing definition of zfs_recover.  Illumos placed
it in spa_misc.c.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2545
This commit is contained in:
Matthew Ahrens
2014-06-05 13:20:08 -08:00
committed by Brian Behlendorf
parent 9b67f60560
commit fbeddd60b7
17 changed files with 339 additions and 157 deletions
+54 -73
View File
@@ -58,12 +58,11 @@ typedef struct traverse_data {
zbookmark_t *td_resume;
int td_flags;
prefetch_data_t *td_pfd;
boolean_t td_paused;
blkptr_cb_t *td_func;
void *td_arg;
} traverse_data_t;
#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD)
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
@@ -165,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* If we found the block we're trying to resume from, zero
* the bookmark out to indicate that we have resumed.
*/
ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
bzero(td->td_resume, sizeof (*zb));
if (td->td_flags & TRAVERSE_POST)
@@ -175,14 +173,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
return (RESUME_SKIP_NONE);
}
static void
traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
{
ASSERT(td->td_resume != NULL);
ASSERT0(zb->zb_level);
bcopy(zb, td->td_resume, sizeof (*td->td_resume));
}
static void
traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_t *zb)
@@ -211,9 +201,8 @@ static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
const blkptr_t *bp, const zbookmark_t *zb)
{
int err = 0, lasterr = 0;
int err = 0;
arc_buf_t *buf = NULL;
boolean_t pause = B_FALSE;
switch (resume_skip_check(td, dnp, zb)) {
case RESUME_SKIP_ALL:
@@ -252,7 +241,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
if (BP_IS_HOLE(bp)) {
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
return (err);
if (err != 0)
goto post;
return (0);
}
if (td->td_pfd && !td->td_pfd->pd_exited &&
@@ -273,8 +264,6 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
if (err == ERESTART)
pause = B_TRUE; /* handle pausing at a common point */
if (err != 0)
goto post;
}
@@ -288,7 +277,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
return (err);
goto post;
czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE);
@@ -307,11 +296,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp,
&((blkptr_t *)buf->b_data)[i], czb);
if (err != 0) {
if (!TD_HARD(td))
break;
lasterr = err;
}
if (err != 0)
break;
}
kmem_free(czb, sizeof (zbookmark_t));
@@ -324,7 +310,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
return (err);
goto post;
dnp = buf->b_data;
for (i = 0; i < epb; i++) {
@@ -336,11 +322,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
for (i = 0; i < epb; i++) {
err = traverse_dnode(td, &dnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
if (err != 0) {
if (!TD_HARD(td))
break;
lasterr = err;
}
if (err != 0)
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT;
@@ -350,7 +333,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
return (err);
goto post;
osp = buf->b_data;
dnp = &osp->os_meta_dnode;
@@ -365,19 +348,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_META_DNODE_OBJECT);
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_groupused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_GROUPUSED_OBJECT);
}
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_userused_dnode;
err = traverse_dnode(td, dnp, zb->zb_objset,
@@ -389,19 +364,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
(void) arc_buf_remove_ref(buf, &buf);
post:
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
if (err == 0 && (td->td_flags & TRAVERSE_POST))
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
if (err == ERESTART)
pause = B_TRUE;
if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
/*
* Ignore this disk error as requested by the HARD flag,
* and continue traversal.
*/
err = 0;
}
if (pause && td->td_resume != NULL) {
ASSERT3U(err, ==, ERESTART);
ASSERT(!TD_HARD(td));
traverse_pause(td, zb);
/*
* If we are stopping here, set td_resume.
*/
if (td->td_resume != NULL && err != 0 && !td->td_paused) {
td->td_resume->zb_objset = zb->zb_objset;
td->td_resume->zb_object = zb->zb_object;
td->td_resume->zb_level = 0;
/*
* If we have stopped on an indirect block (e.g. due to
* i/o error), we have not visited anything below it.
* Set the bookmark to the first level-0 block that we need
* to visit. This way, the resuming code does not need to
* deal with resuming from indirect blocks.
*/
td->td_resume->zb_blkid = zb->zb_blkid <<
(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
td->td_paused = B_TRUE;
}
return (err != 0 ? err : lasterr);
return (err);
}
static void
@@ -426,29 +419,21 @@ static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object)
{
int j, err = 0, lasterr = 0;
int j, err = 0;
zbookmark_t czb;
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
if (err != 0) {
if (!TD_HARD(td))
break;
lasterr = err;
}
if (err != 0)
break;
}
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
if (err != 0) {
if (!TD_HARD(td))
return (err);
lasterr = err;
}
}
return (err != 0 ? err : lasterr);
return (err);
}
/* ARGSUSED */
@@ -539,6 +524,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
td->td_arg = arg;
td->td_pfd = pd;
td->td_flags = flags;
td->td_paused = B_FALSE;
pd->pd_blks_max = zfs_pd_blks_max;
pd->pd_flags = flags;
@@ -617,7 +603,7 @@ int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
int err, lasterr = 0;
int err;
uint64_t obj;
dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset;
@@ -630,16 +616,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
return (err);
/* visit each dataset */
for (obj = 1; err == 0 || (err != ESRCH && hard);
for (obj = 1; err == 0;
err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);
if (err != 0) {
if (!hard)
return (err);
lasterr = err;
continue;
if (hard)
continue;
break;
}
if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
@@ -650,25 +635,21 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
dsl_pool_config_exit(dp, FTAG);
if (err != 0) {
if (!hard)
return (err);
lasterr = err;
continue;
if (hard)
continue;
break;
}
if (ds->ds_phys->ds_prev_snap_txg > txg)
txg = ds->ds_phys->ds_prev_snap_txg;
err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0) {
if (!hard)
return (err);
lasterr = err;
}
if (err != 0)
break;
}
}
if (err == ESRCH)
err = 0;
return (err != 0 ? err : lasterr);
return (err);
}
#if defined(_KERNEL) && defined(HAVE_SPL)