mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-25 18:59:33 +03:00
Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol
4390 i/o errors when deleting filesystem/zvol can lead to space map corruption Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Dan McDonald <danmcd@omniti.com> Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com> Approved by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/4390 https://github.com/illumos/illumos-gate/commit/7fd05ac Porting notes: Previous stack-reduction efforts in traverse_visitbp() caused a fair number of unmergeable pieces of code. This patch should reduce its stack footprint a bit more. The new local bptree_entry_phys_t in bptree_add() is dynamically allocated using kmem_zalloc() for the purpose of stack reduction. The new global zfs_free_leak_on_eio has been defined as an integer rather than a boolean_t as was the case with the related zfs_recover global. Also, zfs_free_leak_on_eio's definition has been inserted into zfs_debug.c for consistency with the existing definition of zfs_recover. Illumos placed it in spa_misc.c. Ported by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #2545
This commit is contained in:
parent
9b67f60560
commit
fbeddd60b7
@ -19,7 +19,7 @@
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _SYS_BPTREE_H
|
||||
@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
|
||||
|
||||
uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
|
||||
int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
|
||||
boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
|
||||
|
||||
void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
|
||||
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
|
||||
|
@ -250,7 +250,6 @@ void zfs_znode_byteswap(void *buf, size_t size);
|
||||
|
||||
#define DMU_USERUSED_OBJECT (-1ULL)
|
||||
#define DMU_GROUPUSED_OBJECT (-2ULL)
|
||||
#define DMU_DEADLIST_OBJECT (-3ULL)
|
||||
|
||||
/*
|
||||
* artificial blkids for bonus buffer and spill blocks
|
||||
|
@ -144,6 +144,7 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
|
||||
#define ORIGIN_DIR_NAME "$ORIGIN"
|
||||
#define XLATION_DIR_NAME "$XLATION"
|
||||
#define FREE_DIR_NAME "$FREE"
|
||||
#define LEAK_DIR_NAME "$LEAK"
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
#define dprintf_dd(dd, fmt, ...) do { \
|
||||
|
@ -87,6 +87,7 @@ typedef struct dsl_pool {
|
||||
struct dsl_dir *dp_root_dir;
|
||||
struct dsl_dir *dp_mos_dir;
|
||||
struct dsl_dir *dp_free_dir;
|
||||
struct dsl_dir *dp_leak_dir;
|
||||
struct dsl_dataset *dp_origin_snap;
|
||||
uint64_t dp_root_dir_obj;
|
||||
struct taskq *dp_iput_taskq;
|
||||
|
@ -116,6 +116,7 @@ typedef struct dsl_scan {
|
||||
/* for freeing blocks */
|
||||
boolean_t scn_is_bptree;
|
||||
boolean_t scn_async_destroying;
|
||||
boolean_t scn_async_stalled;
|
||||
|
||||
/* for debugging / information */
|
||||
uint64_t scn_visited_this_txg;
|
||||
|
@ -193,6 +193,7 @@ typedef enum {
|
||||
ZPOOL_PROP_COMMENT,
|
||||
ZPOOL_PROP_EXPANDSZ,
|
||||
ZPOOL_PROP_FREEING,
|
||||
ZPOOL_PROP_LEAKED,
|
||||
ZPOOL_NUM_PROPS
|
||||
} zpool_prop_t;
|
||||
|
||||
|
@ -48,6 +48,7 @@ extern "C" {
|
||||
|
||||
extern int zfs_flags;
|
||||
extern int zfs_recover;
|
||||
extern int zfs_free_leak_on_eio;
|
||||
|
||||
#define ZFS_DEBUG_DPRINTF (1<<0)
|
||||
#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
|
||||
|
@ -316,6 +316,7 @@ zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
|
||||
case ZPOOL_PROP_ALLOCATED:
|
||||
case ZPOOL_PROP_FREE:
|
||||
case ZPOOL_PROP_FREEING:
|
||||
case ZPOOL_PROP_LEAKED:
|
||||
case ZPOOL_PROP_EXPANDSZ:
|
||||
case ZPOOL_PROP_ASHIFT:
|
||||
if (literal)
|
||||
|
@ -696,6 +696,43 @@ Set additional debugging flags
|
||||
Default value: \fB1\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
\fBzfs_free_leak_on_eio\fR (int)
|
||||
.ad
|
||||
.RS 12n
|
||||
If destroy encounters an EIO while reading metadata (e.g. indirect
|
||||
blocks), space referenced by the missing metadata can not be freed.
|
||||
Normally this causes the background destroy to become "stalled", as
|
||||
it is unable to make forward progress. While in this stalled state,
|
||||
all remaining space to free from the error-encountering filesystem is
|
||||
"temporarily leaked". Set this flag to cause it to ignore the EIO,
|
||||
permanently leak the space from indirect blocks that can not be read,
|
||||
and continue to free everything else that it can.
|
||||
|
||||
The default, "stalling" behavior is useful if the storage partially
|
||||
fails (i.e. some but not all i/os fail), and then later recovers. In
|
||||
this case, we will be able to continue pool operations while it is
|
||||
partially failed, and when it recovers, we can continue to free the
|
||||
space, with no leaks. However, note that this case is actually
|
||||
fairly rare.
|
||||
|
||||
Typically pools either (a) fail completely (but perhaps temporarily,
|
||||
e.g. a top-level vdev going offline), or (b) have localized,
|
||||
permanent errors (e.g. disk returns the wrong data due to bit flip or
|
||||
firmware bug). In case (a), this setting does not matter because the
|
||||
pool will be suspended and the sync thread will not be able to make
|
||||
forward progress regardless. In case (b), because the error is
|
||||
permanent, the best we can do is leak the minimum amount of space,
|
||||
which is what setting this flag will do. Therefore, it is reasonable
|
||||
for this flag to normally be set, but we chose the more conservative
|
||||
approach of not setting it, so that there is no possibility of
|
||||
leaking space in the "partial temporary" failure case.
|
||||
.sp
|
||||
Default value: \fB0\fR.
|
||||
.RE
|
||||
|
||||
.sp
|
||||
.ne 2
|
||||
.na
|
||||
|
@ -81,6 +81,8 @@ zpool_prop_init(void)
|
||||
ZFS_TYPE_POOL, "<size>", "FREE");
|
||||
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
|
||||
ZFS_TYPE_POOL, "<size>", "FREEING");
|
||||
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
|
||||
ZFS_TYPE_POOL, "<size>", "LEAKED");
|
||||
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
|
||||
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
|
||||
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
|
||||
|
@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
|
||||
return (dmu_object_free(os, obj, tx));
|
||||
}
|
||||
|
||||
/*
 * Report whether the bptree object "obj" in objset "os" is empty.
 *
 * Returns B_TRUE when the bt_begin and bt_end fields stored in the
 * object's bonus buffer are equal, B_FALSE otherwise.  The bonus buffer
 * is held only for the duration of the comparison.
 */
boolean_t
|
||||
bptree_is_empty(objset_t *os, uint64_t obj)
|
||||
{
|
||||
dmu_buf_t *db;
|
||||
bptree_phys_t *bt;
|
||||
boolean_t rv;
|
||||
|
||||
/* The hold must return 0; VERIFY0 enforces that. */
VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
|
||||
bt = db->db_data;
|
||||
/* Capture the result before releasing so bt is not read afterwards. */
rv = (bt->bt_begin == bt->bt_end);
|
||||
dmu_buf_rele(db, FTAG);
|
||||
return (rv);
|
||||
}
|
||||
|
||||
void
|
||||
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
|
||||
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_t *db;
|
||||
bptree_phys_t *bt;
|
||||
bptree_entry_phys_t bte;
|
||||
bptree_entry_phys_t *bte;
|
||||
|
||||
/*
|
||||
* bptree objects are in the pool mos, therefore they can only be
|
||||
@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
|
||||
VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
|
||||
bt = db->db_data;
|
||||
|
||||
bte.be_birth_txg = birth_txg;
|
||||
bte.be_bp = *bp;
|
||||
bzero(&bte.be_zb, sizeof (bte.be_zb));
|
||||
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
|
||||
bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
|
||||
bte->be_birth_txg = birth_txg;
|
||||
bte->be_bp = *bp;
|
||||
dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
|
||||
kmem_free(bte, sizeof (*bte));
|
||||
|
||||
dmu_buf_will_dirty(db, tx);
|
||||
bt->bt_end++;
|
||||
@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* If "free" is set:
|
||||
* - It is assumed that "func" will be freeing the block pointers.
|
||||
* - If "func" returns nonzero, the bookmark will be remembered and
|
||||
* iteration will be restarted from this point on next invocation.
|
||||
* - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
|
||||
* bptree_iterate will remember the bookmark, continue traversing
|
||||
* any additional entries, and return 0.
|
||||
*
|
||||
* If "free" is not set, traversal will stop and return an error if
|
||||
* an i/o error is encountered.
|
||||
*
|
||||
* In either case, if zfs_free_leak_on_eio is set, i/o errors will be
|
||||
* ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
|
||||
* traverse_dataset_destroyed()).
|
||||
*/
|
||||
int
|
||||
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
|
||||
void *arg, dmu_tx_t *tx)
|
||||
{
|
||||
boolean_t ioerr = B_FALSE;
|
||||
int err;
|
||||
uint64_t i;
|
||||
dmu_buf_t *db;
|
||||
@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
|
||||
bptree_entry_phys_t bte;
|
||||
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
|
||||
|
||||
ASSERT(!free || i == ba.ba_phys->bt_begin);
|
||||
|
||||
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
|
||||
&bte, DMU_READ_NO_PREFETCH);
|
||||
if (err != 0)
|
||||
break;
|
||||
|
||||
if (zfs_recover)
|
||||
if (zfs_free_leak_on_eio)
|
||||
flags |= TRAVERSE_HARD;
|
||||
zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
|
||||
"bookmark %lld/%lld/%lld/%lld",
|
||||
i, (longlong_t)bte.be_birth_txg,
|
||||
(longlong_t)bte.be_zb.zb_objset,
|
||||
(longlong_t)bte.be_zb.zb_object,
|
||||
(longlong_t)bte.be_zb.zb_level,
|
||||
(longlong_t)bte.be_zb.zb_blkid);
|
||||
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
|
||||
bte.be_birth_txg, &bte.be_zb, flags,
|
||||
bptree_visit_cb, &ba);
|
||||
if (free) {
|
||||
if (err == ERESTART) {
|
||||
/*
|
||||
* The callback has freed the visited block pointers.
|
||||
* Record our traversal progress on disk, either by
|
||||
* updating this record's bookmark, or by logically
|
||||
* removing this record by advancing bt_begin.
|
||||
*/
|
||||
if (err != 0) {
|
||||
/* save bookmark for future resume */
|
||||
ASSERT3U(bte.be_zb.zb_objset, ==,
|
||||
ZB_DESTROYED_OBJSET);
|
||||
ASSERT0(bte.be_zb.zb_level);
|
||||
dmu_write(os, obj, i * sizeof (bte),
|
||||
sizeof (bte), &bte, tx);
|
||||
break;
|
||||
}
|
||||
if (err != 0) {
|
||||
if (err == EIO || err == ECKSUM ||
|
||||
err == ENXIO) {
|
||||
/*
|
||||
* Skip the rest of this tree and
|
||||
* continue on to the next entry.
|
||||
*/
|
||||
err = 0;
|
||||
ioerr = B_TRUE;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else if (ioerr) {
|
||||
/*
|
||||
* We can not properly handle an i/o
|
||||
* error, because the traversal code
|
||||
* does not know how to resume from an
|
||||
* arbitrary bookmark.
|
||||
* This entry is finished, but there were
|
||||
* i/o errors on previous entries, so we
|
||||
* can't adjust bt_begin. Set this entry's
|
||||
* be_birth_txg such that it will be
|
||||
* treated as a no-op in future traversals.
|
||||
*/
|
||||
zfs_panic_recover("error %u from "
|
||||
"traverse_dataset_destroyed()", err);
|
||||
bte.be_birth_txg = UINT64_MAX;
|
||||
dmu_write(os, obj, i * sizeof (bte),
|
||||
sizeof (bte), &bte, tx);
|
||||
}
|
||||
|
||||
ba.ba_phys->bt_begin++;
|
||||
(void) dmu_free_range(os, obj,
|
||||
i * sizeof (bte), sizeof (bte), tx);
|
||||
if (!ioerr) {
|
||||
ba.ba_phys->bt_begin++;
|
||||
(void) dmu_free_range(os, obj,
|
||||
i * sizeof (bte), sizeof (bte), tx);
|
||||
}
|
||||
} else if (err != 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
|
||||
ASSERT(!free || err != 0 || ioerr ||
|
||||
ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
|
||||
|
||||
/* if all blocks are free there should be no used space */
|
||||
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
|
||||
if (zfs_free_leak_on_eio) {
|
||||
ba.ba_phys->bt_bytes = 0;
|
||||
ba.ba_phys->bt_comp = 0;
|
||||
ba.ba_phys->bt_uncomp = 0;
|
||||
}
|
||||
|
||||
ASSERT0(ba.ba_phys->bt_bytes);
|
||||
ASSERT0(ba.ba_phys->bt_comp);
|
||||
ASSERT0(ba.ba_phys->bt_uncomp);
|
||||
|
@ -58,12 +58,11 @@ typedef struct traverse_data {
|
||||
zbookmark_t *td_resume;
|
||||
int td_flags;
|
||||
prefetch_data_t *td_pfd;
|
||||
boolean_t td_paused;
|
||||
blkptr_cb_t *td_func;
|
||||
void *td_arg;
|
||||
} traverse_data_t;
|
||||
|
||||
#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD)
|
||||
|
||||
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
uint64_t objset, uint64_t object);
|
||||
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
|
||||
@ -165,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
* If we found the block we're trying to resume from, zero
|
||||
* the bookmark out to indicate that we have resumed.
|
||||
*/
|
||||
ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
|
||||
if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
|
||||
bzero(td->td_resume, sizeof (*zb));
|
||||
if (td->td_flags & TRAVERSE_POST)
|
||||
@ -175,14 +173,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
return (RESUME_SKIP_NONE);
|
||||
}
|
||||
|
||||
static void
|
||||
traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
|
||||
{
|
||||
ASSERT(td->td_resume != NULL);
|
||||
ASSERT0(zb->zb_level);
|
||||
bcopy(zb, td->td_resume, sizeof (*td->td_resume));
|
||||
}
|
||||
|
||||
static void
|
||||
traverse_prefetch_metadata(traverse_data_t *td,
|
||||
const blkptr_t *bp, const zbookmark_t *zb)
|
||||
@ -211,9 +201,8 @@ static int
|
||||
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
const blkptr_t *bp, const zbookmark_t *zb)
|
||||
{
|
||||
int err = 0, lasterr = 0;
|
||||
int err = 0;
|
||||
arc_buf_t *buf = NULL;
|
||||
boolean_t pause = B_FALSE;
|
||||
|
||||
switch (resume_skip_check(td, dnp, zb)) {
|
||||
case RESUME_SKIP_ALL:
|
||||
@ -252,7 +241,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
|
||||
if (BP_IS_HOLE(bp)) {
|
||||
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
|
||||
return (err);
|
||||
if (err != 0)
|
||||
goto post;
|
||||
return (0);
|
||||
}
|
||||
|
||||
if (td->td_pfd && !td->td_pfd->pd_exited &&
|
||||
@ -273,8 +264,6 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
td->td_arg);
|
||||
if (err == TRAVERSE_VISIT_NO_CHILDREN)
|
||||
return (0);
|
||||
if (err == ERESTART)
|
||||
pause = B_TRUE; /* handle pausing at a common point */
|
||||
if (err != 0)
|
||||
goto post;
|
||||
}
|
||||
@ -288,7 +277,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
|
||||
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
goto post;
|
||||
|
||||
czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE);
|
||||
|
||||
@ -307,11 +296,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
zb->zb_blkid * epb + i);
|
||||
err = traverse_visitbp(td, dnp,
|
||||
&((blkptr_t *)buf->b_data)[i], czb);
|
||||
if (err != 0) {
|
||||
if (!TD_HARD(td))
|
||||
break;
|
||||
lasterr = err;
|
||||
}
|
||||
if (err != 0)
|
||||
break;
|
||||
}
|
||||
|
||||
kmem_free(czb, sizeof (zbookmark_t));
|
||||
@ -324,7 +310,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
|
||||
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
goto post;
|
||||
dnp = buf->b_data;
|
||||
|
||||
for (i = 0; i < epb; i++) {
|
||||
@ -336,11 +322,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
for (i = 0; i < epb; i++) {
|
||||
err = traverse_dnode(td, &dnp[i], zb->zb_objset,
|
||||
zb->zb_blkid * epb + i);
|
||||
if (err != 0) {
|
||||
if (!TD_HARD(td))
|
||||
break;
|
||||
lasterr = err;
|
||||
}
|
||||
if (err != 0)
|
||||
break;
|
||||
}
|
||||
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
|
||||
uint32_t flags = ARC_WAIT;
|
||||
@ -350,7 +333,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
|
||||
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
goto post;
|
||||
|
||||
osp = buf->b_data;
|
||||
dnp = &osp->os_meta_dnode;
|
||||
@ -365,19 +348,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
|
||||
err = traverse_dnode(td, dnp, zb->zb_objset,
|
||||
DMU_META_DNODE_OBJECT);
|
||||
if (err && TD_HARD(td)) {
|
||||
lasterr = err;
|
||||
err = 0;
|
||||
}
|
||||
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
|
||||
dnp = &osp->os_groupused_dnode;
|
||||
err = traverse_dnode(td, dnp, zb->zb_objset,
|
||||
DMU_GROUPUSED_OBJECT);
|
||||
}
|
||||
if (err && TD_HARD(td)) {
|
||||
lasterr = err;
|
||||
err = 0;
|
||||
}
|
||||
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
|
||||
dnp = &osp->os_userused_dnode;
|
||||
err = traverse_dnode(td, dnp, zb->zb_objset,
|
||||
@ -389,19 +364,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
(void) arc_buf_remove_ref(buf, &buf);
|
||||
|
||||
post:
|
||||
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
|
||||
if (err == 0 && (td->td_flags & TRAVERSE_POST))
|
||||
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
|
||||
if (err == ERESTART)
|
||||
pause = B_TRUE;
|
||||
|
||||
if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
|
||||
/*
|
||||
* Ignore this disk error as requested by the HARD flag,
|
||||
* and continue traversal.
|
||||
*/
|
||||
err = 0;
|
||||
}
|
||||
|
||||
if (pause && td->td_resume != NULL) {
|
||||
ASSERT3U(err, ==, ERESTART);
|
||||
ASSERT(!TD_HARD(td));
|
||||
traverse_pause(td, zb);
|
||||
/*
|
||||
* If we are stopping here, set td_resume.
|
||||
*/
|
||||
if (td->td_resume != NULL && err != 0 && !td->td_paused) {
|
||||
td->td_resume->zb_objset = zb->zb_objset;
|
||||
td->td_resume->zb_object = zb->zb_object;
|
||||
td->td_resume->zb_level = 0;
|
||||
/*
|
||||
* If we have stopped on an indirect block (e.g. due to
|
||||
* i/o error), we have not visited anything below it.
|
||||
* Set the bookmark to the first level-0 block that we need
|
||||
* to visit. This way, the resuming code does not need to
|
||||
* deal with resuming from indirect blocks.
|
||||
*/
|
||||
td->td_resume->zb_blkid = zb->zb_blkid <<
|
||||
(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
|
||||
td->td_paused = B_TRUE;
|
||||
}
|
||||
|
||||
return (err != 0 ? err : lasterr);
|
||||
return (err);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -426,29 +419,21 @@ static int
|
||||
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
|
||||
uint64_t objset, uint64_t object)
|
||||
{
|
||||
int j, err = 0, lasterr = 0;
|
||||
int j, err = 0;
|
||||
zbookmark_t czb;
|
||||
|
||||
for (j = 0; j < dnp->dn_nblkptr; j++) {
|
||||
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
|
||||
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
|
||||
if (err != 0) {
|
||||
if (!TD_HARD(td))
|
||||
break;
|
||||
lasterr = err;
|
||||
}
|
||||
if (err != 0)
|
||||
break;
|
||||
}
|
||||
|
||||
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
|
||||
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
|
||||
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
|
||||
if (err != 0) {
|
||||
if (!TD_HARD(td))
|
||||
return (err);
|
||||
lasterr = err;
|
||||
}
|
||||
}
|
||||
return (err != 0 ? err : lasterr);
|
||||
return (err);
|
||||
}
|
||||
|
||||
/* ARGSUSED */
|
||||
@ -539,6 +524,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
|
||||
td->td_arg = arg;
|
||||
td->td_pfd = pd;
|
||||
td->td_flags = flags;
|
||||
td->td_paused = B_FALSE;
|
||||
|
||||
pd->pd_blks_max = zfs_pd_blks_max;
|
||||
pd->pd_flags = flags;
|
||||
@ -617,7 +603,7 @@ int
|
||||
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
|
||||
blkptr_cb_t func, void *arg)
|
||||
{
|
||||
int err, lasterr = 0;
|
||||
int err;
|
||||
uint64_t obj;
|
||||
dsl_pool_t *dp = spa_get_dsl(spa);
|
||||
objset_t *mos = dp->dp_meta_objset;
|
||||
@ -630,16 +616,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
|
||||
return (err);
|
||||
|
||||
/* visit each dataset */
|
||||
for (obj = 1; err == 0 || (err != ESRCH && hard);
|
||||
for (obj = 1; err == 0;
|
||||
err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
|
||||
dmu_object_info_t doi;
|
||||
|
||||
err = dmu_object_info(mos, obj, &doi);
|
||||
if (err != 0) {
|
||||
if (!hard)
|
||||
return (err);
|
||||
lasterr = err;
|
||||
continue;
|
||||
if (hard)
|
||||
continue;
|
||||
break;
|
||||
}
|
||||
|
||||
if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
|
||||
@ -650,25 +635,21 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
|
||||
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
|
||||
dsl_pool_config_exit(dp, FTAG);
|
||||
if (err != 0) {
|
||||
if (!hard)
|
||||
return (err);
|
||||
lasterr = err;
|
||||
continue;
|
||||
if (hard)
|
||||
continue;
|
||||
break;
|
||||
}
|
||||
if (ds->ds_phys->ds_prev_snap_txg > txg)
|
||||
txg = ds->ds_phys->ds_prev_snap_txg;
|
||||
err = traverse_dataset(ds, txg, flags, func, arg);
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
if (err != 0) {
|
||||
if (!hard)
|
||||
return (err);
|
||||
lasterr = err;
|
||||
}
|
||||
if (err != 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (err == ESRCH)
|
||||
err = 0;
|
||||
return (err != 0 ? err : lasterr);
|
||||
return (err);
|
||||
}
|
||||
|
||||
#if defined(_KERNEL) && defined(HAVE_SPL)
|
||||
|
@ -245,6 +245,13 @@ dsl_pool_open(dsl_pool_t *dp)
|
||||
dp->dp_meta_objset, obj));
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: errors ignored, because the leak dir will not exist if we
|
||||
* have not encountered a leak yet.
|
||||
*/
|
||||
(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
|
||||
&dp->dp_leak_dir);
|
||||
|
||||
if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
|
||||
@ -292,6 +299,8 @@ dsl_pool_close(dsl_pool_t *dp)
|
||||
dsl_dir_rele(dp->dp_mos_dir, dp);
|
||||
if (dp->dp_free_dir)
|
||||
dsl_dir_rele(dp->dp_free_dir, dp);
|
||||
if (dp->dp_leak_dir)
|
||||
dsl_dir_rele(dp->dp_leak_dir, dp);
|
||||
if (dp->dp_root_dir)
|
||||
dsl_dir_rele(dp->dp_root_dir, dp);
|
||||
|
||||
|
@ -65,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
|
||||
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
|
||||
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
|
||||
int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
|
||||
int zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
|
||||
int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
|
||||
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
|
||||
int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
|
||||
|
||||
@ -1417,7 +1417,7 @@ dsl_scan_active(dsl_scan_t *scn)
|
||||
if (spa_shutting_down(spa))
|
||||
return (B_FALSE);
|
||||
if (scn->scn_phys.scn_state == DSS_SCANNING ||
|
||||
scn->scn_async_destroying)
|
||||
(scn->scn_async_destroying && !scn->scn_async_stalled))
|
||||
return (B_TRUE);
|
||||
|
||||
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
|
||||
@ -1432,7 +1432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_scan_t *scn = dp->dp_scan;
|
||||
spa_t *spa = dp->dp_spa;
|
||||
int err;
|
||||
int err = 0;
|
||||
|
||||
/*
|
||||
* Check for scn_restart_txg before checking spa_load_state, so
|
||||
@ -1450,7 +1450,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
dsl_scan_setup_sync(&func, tx);
|
||||
}
|
||||
|
||||
if (!dsl_scan_active(scn) ||
|
||||
/*
|
||||
* If the scan is inactive due to a stalled async destroy, try again.
|
||||
*/
|
||||
if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
|
||||
spa_sync_pass(dp->dp_spa) > 1)
|
||||
return;
|
||||
|
||||
@ -1460,10 +1463,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
spa->spa_scrub_active = B_TRUE;
|
||||
|
||||
/*
|
||||
* First process the free list. If we pause the free, don't do
|
||||
* any scanning. This ensures that there is no free list when
|
||||
* we are scanning, so the scan code doesn't have to worry about
|
||||
* traversing it.
|
||||
* First process the async destroys. If we pause, don't do
|
||||
* any scrubbing or resilvering. This ensures that there are no
|
||||
* async destroys while we are scanning, so the scan code doesn't
|
||||
* have to worry about traversing them. It is also faster to free the
|
||||
* blocks than to scrub them.
|
||||
*/
|
||||
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
|
||||
scn->scn_is_bptree = B_FALSE;
|
||||
@ -1473,48 +1477,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
dsl_scan_free_block_cb, scn, tx);
|
||||
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
|
||||
|
||||
if (err == 0 && spa_feature_is_active(spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
ASSERT(scn->scn_async_destroying);
|
||||
scn->scn_is_bptree = B_TRUE;
|
||||
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
|
||||
NULL, ZIO_FLAG_MUSTSUCCEED);
|
||||
err = bptree_iterate(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
|
||||
scn, tx);
|
||||
VERIFY0(zio_wait(scn->scn_zio_root));
|
||||
if (err != 0 && err != ERESTART)
|
||||
zfs_panic_recover("error %u from bpobj_iterate()", err);
|
||||
}
|
||||
|
||||
if (err == 0) {
|
||||
/* finished; deactivate async destroy feature */
|
||||
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY,
|
||||
tx);
|
||||
ASSERT(!spa_feature_is_active(spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY));
|
||||
VERIFY0(zap_remove(dp->dp_meta_objset,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, tx));
|
||||
VERIFY0(bptree_free(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, tx));
|
||||
dp->dp_bptree_obj = 0;
|
||||
scn->scn_async_destroying = B_FALSE;
|
||||
}
|
||||
if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
ASSERT(scn->scn_async_destroying);
|
||||
scn->scn_is_bptree = B_TRUE;
|
||||
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
|
||||
NULL, ZIO_FLAG_MUSTSUCCEED);
|
||||
err = bptree_iterate(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
|
||||
VERIFY0(zio_wait(scn->scn_zio_root));
|
||||
|
||||
if (err == EIO || err == ECKSUM) {
|
||||
err = 0;
|
||||
} else if (err != 0 && err != ERESTART) {
|
||||
zfs_panic_recover("error %u from "
|
||||
"traverse_dataset_destroyed()", err);
|
||||
}
|
||||
if (scn->scn_visited_this_txg) {
|
||||
zfs_dbgmsg("freed %llu blocks in %llums from "
|
||||
"free_bpobj/bptree txg %llu",
|
||||
(longlong_t)scn->scn_visited_this_txg,
|
||||
(longlong_t)
|
||||
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
|
||||
(longlong_t)tx->tx_txg);
|
||||
scn->scn_visited_this_txg = 0;
|
||||
/*
|
||||
* Re-sync the ddt so that we can further modify
|
||||
* it when doing bprewrite.
|
||||
*/
|
||||
ddt_sync(spa, tx->tx_txg);
|
||||
|
||||
/*
|
||||
* If we didn't make progress, mark the async destroy as
|
||||
* stalled, so that we will not initiate a spa_sync() on
|
||||
* its behalf.
|
||||
*/
|
||||
scn->scn_async_stalled = (scn->scn_visited_this_txg == 0);
|
||||
|
||||
if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
|
||||
/* finished; deactivate async destroy feature */
|
||||
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
|
||||
ASSERT(!spa_feature_is_active(spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY));
|
||||
VERIFY0(zap_remove(dp->dp_meta_objset,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, tx));
|
||||
VERIFY0(bptree_free(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, tx));
|
||||
dp->dp_bptree_obj = 0;
|
||||
scn->scn_async_destroying = B_FALSE;
|
||||
}
|
||||
if (err == ERESTART)
|
||||
return;
|
||||
}
|
||||
if (scn->scn_visited_this_txg) {
|
||||
zfs_dbgmsg("freed %llu blocks in %llums from "
|
||||
"free_bpobj/bptree txg %llu; err=%u",
|
||||
(longlong_t)scn->scn_visited_this_txg,
|
||||
(longlong_t)
|
||||
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
|
||||
(longlong_t)tx->tx_txg, err);
|
||||
scn->scn_visited_this_txg = 0;
|
||||
|
||||
/*
|
||||
* Write out changes to the DDT that may be required as a
|
||||
* result of the blocks freed. This ensures that the DDT
|
||||
* is clean when a scrub/resilver runs.
|
||||
*/
|
||||
ddt_sync(spa, tx->tx_txg);
|
||||
}
|
||||
if (err != 0)
|
||||
return;
|
||||
if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
|
||||
(dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
|
||||
dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
|
||||
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
|
||||
/*
|
||||
* We have finished background destroying, but there is still
|
||||
* some space left in the dp_free_dir. Transfer this leaked
|
||||
* space to the dp_leak_dir.
|
||||
*/
|
||||
if (dp->dp_leak_dir == NULL) {
|
||||
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
|
||||
(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
|
||||
LEAK_DIR_NAME, tx);
|
||||
VERIFY0(dsl_pool_open_special_dir(dp,
|
||||
LEAK_DIR_NAME, &dp->dp_leak_dir));
|
||||
rrw_exit(&dp->dp_config_rwlock, FTAG);
|
||||
}
|
||||
dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
|
||||
dp->dp_free_dir->dd_phys->dd_used_bytes,
|
||||
dp->dp_free_dir->dd_phys->dd_compressed_bytes,
|
||||
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
|
||||
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
|
||||
-dp->dp_free_dir->dd_phys->dd_used_bytes,
|
||||
-dp->dp_free_dir->dd_phys->dd_compressed_bytes,
|
||||
-dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
|
||||
}
|
||||
if (!scn->scn_async_destroying) {
|
||||
/* finished; verify that space accounting went to zero */
|
||||
ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
|
||||
ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
|
||||
|
@ -238,19 +238,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
|
||||
}
|
||||
|
||||
if (pool != NULL) {
|
||||
dsl_dir_t *freedir = pool->dp_free_dir;
|
||||
|
||||
/*
|
||||
* The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
|
||||
* when opening pools before this version freedir will be NULL.
|
||||
*/
|
||||
if (freedir != NULL) {
|
||||
if (pool->dp_free_dir != NULL) {
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
|
||||
freedir->dd_phys->dd_used_bytes, src);
|
||||
pool->dp_free_dir->dd_phys->dd_used_bytes, src);
|
||||
} else {
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
|
||||
NULL, 0, src);
|
||||
}
|
||||
|
||||
if (pool->dp_leak_dir != NULL) {
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
|
||||
pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
|
||||
} else {
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
|
||||
NULL, 0, src);
|
||||
}
|
||||
}
|
||||
|
||||
spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
|
||||
|
@ -29,7 +29,7 @@
|
||||
list_t zfs_dbgmsgs;
|
||||
int zfs_dbgmsg_size;
|
||||
kmutex_t zfs_dbgmsgs_lock;
|
||||
int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
|
||||
int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -44,7 +44,38 @@ int zfs_flags = 0;
|
||||
* This should only be used as a last resort, as it typically results
|
||||
* in leaked space, or worse.
|
||||
*/
|
||||
int zfs_recover = 0;
|
||||
int zfs_recover = B_FALSE;
|
||||
|
||||
/*
|
||||
* If destroy encounters an EIO while reading metadata (e.g. indirect
|
||||
* blocks), space referenced by the missing metadata can not be freed.
|
||||
* Normally this causes the background destroy to become "stalled", as
|
||||
* it is unable to make forward progress. While in this stalled state,
|
||||
* all remaining space to free from the error-encountering filesystem is
|
||||
* "temporarily leaked". Set this flag to cause it to ignore the EIO,
|
||||
* permanently leak the space from indirect blocks that can not be read,
|
||||
* and continue to free everything else that it can.
|
||||
*
|
||||
* The default, "stalling" behavior is useful if the storage partially
|
||||
* fails (i.e. some but not all i/os fail), and then later recovers. In
|
||||
* this case, we will be able to continue pool operations while it is
|
||||
* partially failed, and when it recovers, we can continue to free the
|
||||
* space, with no leaks. However, note that this case is actually
|
||||
* fairly rare.
|
||||
*
|
||||
* Typically pools either (a) fail completely (but perhaps temporarily,
|
||||
* e.g. a top-level vdev going offline), or (b) have localized,
|
||||
* permanent errors (e.g. disk returns the wrong data due to bit flip or
|
||||
* firmware bug). In case (a), this setting does not matter because the
|
||||
* pool will be suspended and the sync thread will not be able to make
|
||||
* forward progress regardless. In case (b), because the error is
|
||||
* permanent, the best we can do is leak the minimum amount of space,
|
||||
* which is what setting this flag will do. Therefore, it is reasonable
|
||||
* for this flag to normally be set, but we chose the more conservative
|
||||
* approach of not setting it, so that there is no possibility of
|
||||
* leaking space in the "partial temporary" failure case.
|
||||
*/
|
||||
int zfs_free_leak_on_eio = B_FALSE;
|
||||
|
||||
|
||||
void
|
||||
@ -163,4 +194,8 @@ MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags");
|
||||
|
||||
module_param(zfs_recover, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors");
|
||||
|
||||
module_param(zfs_free_leak_on_eio, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_free_leak_on_eio,
|
||||
"Set to ignore IO errors during free and permanently leak the space");
|
||||
#endif /* _KERNEL */
|
||||
|
@ -3356,13 +3356,6 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
|
||||
ASSERT(zb1->zb_objset == zb2->zb_objset);
|
||||
ASSERT(zb2->zb_level == 0);
|
||||
|
||||
/*
|
||||
* A bookmark in the deadlist is considered to be after
|
||||
* everything else.
|
||||
*/
|
||||
if (zb2->zb_object == DMU_DEADLIST_OBJECT)
|
||||
return (B_TRUE);
|
||||
|
||||
/* The objset_phys_t isn't before anything. */
|
||||
if (dnp == NULL)
|
||||
return (B_FALSE);
|
||||
|
Loading…
Reference in New Issue
Block a user