From fbeddd60b79690b6a6ececc9b00b6014d21405aa Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 5 Jun 2014 13:20:08 -0800 Subject: [PATCH] Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol 4390 i/o errors when deleting filesystem/zvol can lead to space map corruption Reviewed by: George Wilson Reviewed by: Christopher Siden Reviewed by: Adam Leventhal Reviewed by: Dan McDonald Reviewed by: Saso Kiselkov Approved by: Dan McDonald References: https://www.illumos.org/issues/4390 https://github.com/illumos/illumos-gate/commit/7fd05ac Porting notes: Previous stack-reduction efforts in traverse_visitb() caused a fair number of un-mergable pieces of code. This patch should reduce its stack footprint a bit more. The new local bptree_entry_phys_t in bptree_add() is dynamically-allocated using kmem_zalloc() for the purpose of stack reduction. The new global zfs_free_leak_on_eio has been defined as an integer rather than a boolean_t as was the case with the related zfs_recover global. Also, zfs_free_leak_on_eio's definition has been inserted into zfs_debug.c for consistency with the existing definition of zfs_recover. Illumos placed it in spa_misc.c. Ported by: Tim Chase Signed-off-by: Brian Behlendorf Closes #2545 --- include/sys/bptree.h | 3 +- include/sys/dmu.h | 1 - include/sys/dsl_dir.h | 1 + include/sys/dsl_pool.h | 1 + include/sys/dsl_scan.h | 1 + include/sys/fs/zfs.h | 1 + include/sys/zfs_debug.h | 1 + lib/libzfs/libzfs_pool.c | 1 + man/man5/zfs-module-parameters.5 | 37 ++++++++ module/zcommon/zpool_prop.c | 2 + module/zfs/bptree.c | 109 +++++++++++++++++++----- module/zfs/dmu_traverse.c | 127 ++++++++++++--------------- module/zfs/dsl_pool.c | 9 ++ module/zfs/dsl_scan.c | 142 +++++++++++++++++++++---------- module/zfs/spa.c | 14 ++- module/zfs/zfs_debug.c | 39 ++++++++- module/zfs/zio.c | 7 -- 17 files changed, 339 insertions(+), 157 deletions(-) diff --git a/include/sys/bptree.h b/include/sys/bptree.h index 971507211..a533cb949 100644 --- a/include/sys/bptree.h +++ b/include/sys/bptree.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_BPTREE_H @@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); +boolean_t bptree_is_empty(objset_t *os, uint64_t obj); void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 89a0e5bd7..aa3e8f25a 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -250,7 +250,6 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) -#define DMU_DEADLIST_OBJECT (-3ULL) /* * artificial blkids for bonus buffer and spill blocks diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index d69d47696..3aa775232 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -144,6 +144,7 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, #define ORIGIN_DIR_NAME "$ORIGIN" #define XLATION_DIR_NAME "$XLATION" #define FREE_DIR_NAME "$FREE" +#define LEAK_DIR_NAME "$LEAK" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) do { \ diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index d5bad8dc1..34dc65ba4 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -87,6 +87,7 @@ typedef struct dsl_pool { struct dsl_dir *dp_root_dir; struct dsl_dir *dp_mos_dir; struct dsl_dir *dp_free_dir; + struct dsl_dir *dp_leak_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_iput_taskq; diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index bcb85d67d..de6a7d17a 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -116,6 +116,7 @@ typedef struct dsl_scan { /* for freeing blocks */ boolean_t scn_is_bptree; boolean_t scn_async_destroying; + boolean_t scn_async_stalled; /* for debugging / information */ uint64_t scn_visited_this_txg; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 5371e12ef..227d8b2fb 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -193,6 +193,7 @@ typedef enum { ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, + ZPOOL_PROP_LEAKED, ZPOOL_NUM_PROPS } zpool_prop_t; diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index e51207955..829b37a46 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -48,6 +48,7 @@ extern "C" { extern int zfs_flags; extern int zfs_recover; +extern int zfs_free_leak_on_eio; #define ZFS_DEBUG_DPRINTF (1<<0) #define ZFS_DEBUG_DBUF_VERIFY (1<<1) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index a5d2c28af..fce8fa96d 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -316,6 +316,7 @@ zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREEING: + case ZPOOL_PROP_LEAKED: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_ASHIFT: if (literal) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index d3aa2e6b0..2f87f114c 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -696,6 +696,43 @@ Set additional debugging flags Default value: \fB1\fR. .RE +.sp +.ne 2 +.na +\fBzfs_free_leak_on_eio\fR (int) +.ad +.RS 12n +If destroy encounters an EIO while reading metadata (e.g. indirect +blocks), space referenced by the missing metadata can not be freed. +Normally this causes the background destroy to become "stalled", as +it is unable to make forward progress. While in this stalled state, +all remaining space to free from the error-encountering filesystem is +"temporarily leaked". Set this flag to cause it to ignore the EIO, +permanently leak the space from indirect blocks that can not be read, +and continue to free everything else that it can. + +The default, "stalling" behavior is useful if the storage partially +fails (i.e. some but not all i/os fail), and then later recovers. In +this case, we will be able to continue pool operations while it is +partially failed, and when it recovers, we can continue to free the +space, with no leaks. However, note that this case is actually +fairly rare. + +Typically pools either (a) fail completely (but perhaps temporarily, +e.g. a top-level vdev going offline), or (b) have localized, +permanent errors (e.g. disk returns the wrong data due to bit flip or +firmware bug). In case (a), this setting does not matter because the +pool will be suspended and the sync thread will not be able to make +forward progress regardless. In case (b), because the error is +permanent, the best we can do is leak the minimum amount of space, +which is what setting this flag will do. Therefore, it is reasonable +for this flag to normally be set, but we chose the more conservative +approach of not setting it, so that there is no possibility of +leaking space in the "partial temporary" failure case. +.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 1173fc0c9..6775c09d3 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -81,6 +81,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "", "FREE"); zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREEING"); + zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "LEAKED"); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index 83f365864..cbe8d1caa 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) return (dmu_object_free(os, obj, tx)); } +boolean_t +bptree_is_empty(objset_t *os, uint64_t obj) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + boolean_t rv; + + VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + rv = (bt->bt_begin == bt->bt_end); + dmu_buf_rele(db, FTAG); + return (rv); +} + void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) { dmu_buf_t *db; bptree_phys_t *bt; - bptree_entry_phys_t bte; + bptree_entry_phys_t *bte; /* * bptree objects are in the pool mos, therefore they can only be @@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; - bte.be_birth_txg = birth_txg; - bte.be_bp = *bp; - bzero(&bte.be_zb, sizeof (bte.be_zb)); - dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); + bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE); + bte->be_birth_txg = birth_txg; + bte->be_bp = *bp; + dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx); + kmem_free(bte, sizeof (*bte)); dmu_buf_will_dirty(db, tx); bt->bt_end++; @@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (err); } +/* + * If "free" is set: + * - It is assumed that "func" will be freeing the block pointers. + * - If "func" returns nonzero, the bookmark will be remembered and + * iteration will be restarted from this point on next invocation. + * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), + * bptree_iterate will remember the bookmark, continue traversing + * any additional entries, and return 0. + * + * If "free" is not set, traversal will stop and return an error if + * an i/o error is encountered. + * + * In either case, if zfs_free_leak_on_eio is set, i/o errors will be + * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to + * traverse_dataset_destroyed()). + */ int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, void *arg, dmu_tx_t *tx) { + boolean_t ioerr = B_FALSE; int err; uint64_t i; dmu_buf_t *db; @@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, bptree_entry_phys_t bte; int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; - ASSERT(!free || i == ba.ba_phys->bt_begin); - err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), &bte, DMU_READ_NO_PREFETCH); if (err != 0) break; - if (zfs_recover) + if (zfs_free_leak_on_eio) flags |= TRAVERSE_HARD; + zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld " + "bookmark %lld/%lld/%lld/%lld", + i, (longlong_t)bte.be_birth_txg, + (longlong_t)bte.be_zb.zb_objset, + (longlong_t)bte.be_zb.zb_object, + (longlong_t)bte.be_zb.zb_level, + (longlong_t)bte.be_zb.zb_blkid); err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, bte.be_birth_txg, &bte.be_zb, flags, bptree_visit_cb, &ba); if (free) { - if (err == ERESTART) { + /* + * The callback has freed the visited block pointers. + * Record our traversal progress on disk, either by + * updating this record's bookmark, or by logically + * removing this record by advancing bt_begin. + */ + if (err != 0) { /* save bookmark for future resume */ ASSERT3U(bte.be_zb.zb_objset, ==, ZB_DESTROYED_OBJSET); ASSERT0(bte.be_zb.zb_level); dmu_write(os, obj, i * sizeof (bte), sizeof (bte), &bte, tx); - break; - } - if (err != 0) { + if (err == EIO || err == ECKSUM || + err == ENXIO) { + /* + * Skip the rest of this tree and + * continue on to the next entry. + */ + err = 0; + ioerr = B_TRUE; + } else { + break; + } + } else if (ioerr) { /* - * We can not properly handle an i/o - * error, because the traversal code - * does not know how to resume from an - * arbitrary bookmark. + * This entry is finished, but there were + * i/o errors on previous entries, so we + * can't adjust bt_begin. Set this entry's + * be_birth_txg such that it will be + * treated as a no-op in future traversals. */ - zfs_panic_recover("error %u from " - "traverse_dataset_destroyed()", err); + bte.be_birth_txg = UINT64_MAX; + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); } - ba.ba_phys->bt_begin++; - (void) dmu_free_range(os, obj, - i * sizeof (bte), sizeof (bte), tx); + if (!ioerr) { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } else if (err != 0) { + break; } } - ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + ASSERT(!free || err != 0 || ioerr || + ba.ba_phys->bt_begin == ba.ba_phys->bt_end); /* if all blocks are free there should be no used space */ if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + if (zfs_free_leak_on_eio) { + ba.ba_phys->bt_bytes = 0; + ba.ba_phys->bt_comp = 0; + ba.ba_phys->bt_uncomp = 0; + } + ASSERT0(ba.ba_phys->bt_bytes); ASSERT0(ba.ba_phys->bt_comp); ASSERT0(ba.ba_phys->bt_uncomp); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index e086e2487..fe352469b 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -58,12 +58,11 @@ typedef struct traverse_data { zbookmark_t *td_resume; int td_flags; prefetch_data_t *td_pfd; + boolean_t td_paused; blkptr_cb_t *td_func; void *td_arg; } traverse_data_t; -#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD) - static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object); static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, @@ -165,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * If we found the block we're trying to resume from, zero * the bookmark out to indicate that we have resumed. */ - ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { bzero(td->td_resume, sizeof (*zb)); if (td->td_flags & TRAVERSE_POST) @@ -175,14 +173,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, return (RESUME_SKIP_NONE); } -static void -traverse_pause(traverse_data_t *td, const zbookmark_t *zb) -{ - ASSERT(td->td_resume != NULL); - ASSERT0(zb->zb_level); - bcopy(zb, td->td_resume, sizeof (*td->td_resume)); -} - static void traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_t *zb) @@ -211,9 +201,8 @@ static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) { - int err = 0, lasterr = 0; + int err = 0; arc_buf_t *buf = NULL; - boolean_t pause = B_FALSE; switch (resume_skip_check(td, dnp, zb)) { case RESUME_SKIP_ALL: @@ -252,7 +241,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - return (err); + if (err != 0) + goto post; + return (0); } if (td->td_pfd && !td->td_pfd->pd_exited && @@ -273,8 +264,6 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); - if (err == ERESTART) - pause = B_TRUE; /* handle pausing at a common point */ if (err != 0) goto post; } @@ -288,7 +277,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) - return (err); + goto post; czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE); @@ -307,11 +296,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, &((blkptr_t *)buf->b_data)[i], czb); - if (err != 0) { - if (!TD_HARD(td)) - break; - lasterr = err; - } + if (err != 0) + break; } kmem_free(czb, sizeof (zbookmark_t)); @@ -324,7 +310,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) - return (err); + goto post; dnp = buf->b_data; for (i = 0; i < epb; i++) { @@ -336,11 +322,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, for (i = 0; i < epb; i++) { err = traverse_dnode(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); - if (err != 0) { - if (!TD_HARD(td)) - break; - lasterr = err; - } + if (err != 0) + break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; @@ -350,7 +333,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) - return (err); + goto post; osp = buf->b_data; dnp = &osp->os_meta_dnode; @@ -365,19 +348,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = traverse_dnode(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); - if (err && TD_HARD(td)) { - lasterr = err; - err = 0; - } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_groupused_dnode; err = traverse_dnode(td, dnp, zb->zb_objset, DMU_GROUPUSED_OBJECT); } - if (err && TD_HARD(td)) { - lasterr = err; - err = 0; - } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_userused_dnode; err = traverse_dnode(td, dnp, zb->zb_objset, @@ -389,19 +364,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, (void) arc_buf_remove_ref(buf, &buf); post: - if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + if (err == 0 && (td->td_flags & TRAVERSE_POST)) err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - if (err == ERESTART) - pause = B_TRUE; + + if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) { + /* + * Ignore this disk error as requested by the HARD flag, + * and continue traversal. + */ + err = 0; } - if (pause && td->td_resume != NULL) { - ASSERT3U(err, ==, ERESTART); - ASSERT(!TD_HARD(td)); - traverse_pause(td, zb); + /* + * If we are stopping here, set td_resume. + */ + if (td->td_resume != NULL && err != 0 && !td->td_paused) { + td->td_resume->zb_objset = zb->zb_objset; + td->td_resume->zb_object = zb->zb_object; + td->td_resume->zb_level = 0; + /* + * If we have stopped on an indirect block (e.g. due to + * i/o error), we have not visited anything below it. + * Set the bookmark to the first level-0 block that we need + * to visit. This way, the resuming code does not need to + * deal with resuming from indirect blocks. + */ + td->td_resume->zb_blkid = zb->zb_blkid << + (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + td->td_paused = B_TRUE; } - return (err != 0 ? err : lasterr); + return (err); } static void @@ -426,29 +419,21 @@ static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { - int j, err = 0, lasterr = 0; + int j, err = 0; zbookmark_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); - if (err != 0) { - if (!TD_HARD(td)) - break; - lasterr = err; - } + if (err != 0) + break; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); - if (err != 0) { - if (!TD_HARD(td)) - return (err); - lasterr = err; - } } - return (err != 0 ? err : lasterr); + return (err); } /* ARGSUSED */ @@ -539,6 +524,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, td->td_arg = arg; td->td_pfd = pd; td->td_flags = flags; + td->td_paused = B_FALSE; pd->pd_blks_max = zfs_pd_blks_max; pd->pd_flags = flags; @@ -617,7 +603,7 @@ int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - int err, lasterr = 0; + int err; uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; @@ -630,16 +616,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, return (err); /* visit each dataset */ - for (obj = 1; err == 0 || (err != ESRCH && hard); + for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); if (err != 0) { - if (!hard) - return (err); - lasterr = err; - continue; + if (hard) + continue; + break; } if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { @@ -650,25 +635,21 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); dsl_pool_config_exit(dp, FTAG); if (err != 0) { - if (!hard) - return (err); - lasterr = err; - continue; + if (hard) + continue; + break; } if (ds->ds_phys->ds_prev_snap_txg > txg) txg = ds->ds_phys->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err != 0) { - if (!hard) - return (err); - lasterr = err; - } + if (err != 0) + break; } } if (err == ESRCH) err = 0; - return (err != 0 ? err : lasterr); + return (err); } #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 91476cc0b..f1ab29c62 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -245,6 +245,13 @@ dsl_pool_open(dsl_pool_t *dp) dp->dp_meta_objset, obj)); } + /* + * Note: errors ignored, because the leak dir will not exist if we + * have not encountered a leak yet. + */ + (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, + &dp->dp_leak_dir); + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, @@ -292,6 +299,8 @@ dsl_pool_close(dsl_pool_t *dp) dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir) dsl_dir_rele(dp->dp_free_dir, dp); + if (dp->dp_leak_dir) + dsl_dir_rele(dp->dp_leak_dir, dp); if (dp->dp_root_dir) dsl_dir_rele(dp->dp_root_dir, dp); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f03f1f54b..0f0243b31 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -65,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -int zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ +int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ @@ -1417,7 +1417,7 @@ dsl_scan_active(dsl_scan_t *scn) if (spa_shutting_down(spa)) return (B_FALSE); if (scn->scn_phys.scn_state == DSS_SCANNING || - scn->scn_async_destroying) + (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { @@ -1432,7 +1432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - int err; + int err = 0; /* * Check for scn_restart_txg before checking spa_load_state, so @@ -1450,7 +1450,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_setup_sync(&func, tx); } - if (!dsl_scan_active(scn) || + /* + * If the scan is inactive due to a stalled async destroy, try again. + */ + if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) || spa_sync_pass(dp->dp_spa) > 1) return; @@ -1460,10 +1463,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) spa->spa_scrub_active = B_TRUE; /* - * First process the free list. If we pause the free, don't do - * any scanning. This ensures that there is no free list when - * we are scanning, so the scan code doesn't have to worry about - * traversing it. + * First process the async destroys. If we pause, don't do + * any scrubbing or resilvering. This ensures that there are no + * async destroys while we are scanning, so the scan code doesn't + * have to worry about traversing it. It is also faster to free the + * blocks than to scrub them. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; @@ -1473,48 +1477,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); - if (err == 0 && spa_feature_is_active(spa, - SPA_FEATURE_ASYNC_DESTROY)) { - ASSERT(scn->scn_async_destroying); - scn->scn_is_bptree = B_TRUE; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_MUSTSUCCEED); - err = bptree_iterate(dp->dp_meta_objset, - dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, - scn, tx); - VERIFY0(zio_wait(scn->scn_zio_root)); + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + } - if (err == 0) { - /* finished; deactivate async destroy feature */ - spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, - tx); - ASSERT(!spa_feature_is_active(spa, - SPA_FEATURE_ASYNC_DESTROY)); - VERIFY0(zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, tx)); - VERIFY0(bptree_free(dp->dp_meta_objset, - dp->dp_bptree_obj, tx)); - dp->dp_bptree_obj = 0; - scn->scn_async_destroying = B_FALSE; - } + if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + ASSERT(scn->scn_async_destroying); + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); + VERIFY0(zio_wait(scn->scn_zio_root)); + + if (err == EIO || err == ECKSUM) { + err = 0; + } else if (err != 0 && err != ERESTART) { + zfs_panic_recover("error %u from " + "traverse_dataset_destroyed()", err); } - if (scn->scn_visited_this_txg) { - zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj/bptree txg %llu", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t) - NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), - (longlong_t)tx->tx_txg); - scn->scn_visited_this_txg = 0; - /* - * Re-sync the ddt so that we can further modify - * it when doing bprewrite. - */ - ddt_sync(spa, tx->tx_txg); + + /* + * If we didn't make progress, mark the async destroy as + * stalled, so that we will not initiate a spa_sync() on + * its behalf. + */ + scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); + + if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { + /* finished; deactivate async destroy feature */ + spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); + ASSERT(!spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY0(bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + scn->scn_async_destroying = B_FALSE; } - if (err == ERESTART) - return; + } + if (scn->scn_visited_this_txg) { + zfs_dbgmsg("freed %llu blocks in %llums from " + "free_bpobj/bptree txg %llu; err=%u", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t) + NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), + (longlong_t)tx->tx_txg, err); + scn->scn_visited_this_txg = 0; + + /* + * Write out changes to the DDT that may be required as a + * result of the blocks freed. This ensures that the DDT + * is clean when a scrub/resilver runs. + */ + ddt_sync(spa, tx->tx_txg); + } + if (err != 0) + return; + if (!scn->scn_async_destroying && zfs_free_leak_on_eio && + (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 || + dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 || + dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) { + /* + * We have finished background destroying, but there is still + * some space left in the dp_free_dir. Transfer this leaked + * space to the dp_leak_dir. + */ + if (dp->dp_leak_dir == NULL) { + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + LEAK_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + LEAK_DIR_NAME, &dp->dp_leak_dir)); + rrw_exit(&dp->dp_config_rwlock, FTAG); + } + dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, + dp->dp_free_dir->dd_phys->dd_used_bytes, + dp->dp_free_dir->dd_phys->dd_compressed_bytes, + dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + -dp->dp_free_dir->dd_phys->dd_used_bytes, + -dp->dp_free_dir->dd_phys->dd_compressed_bytes, + -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx); + } + if (!scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes); ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b9fa45f82..e4b71ea72 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -238,19 +238,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) } if (pool != NULL) { - dsl_dir_t *freedir = pool->dp_free_dir; - /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ - if (freedir != NULL) { + if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, - freedir->dd_phys->dd_used_bytes, src); + pool->dp_free_dir->dd_phys->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } + + if (pool->dp_leak_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, + pool->dp_leak_dir->dd_phys->dd_used_bytes, src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, + NULL, 0, src); + } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c index 4f612e16b..47b7834f5 100644 --- a/module/zfs/zfs_debug.c +++ b/module/zfs/zfs_debug.c @@ -29,7 +29,7 @@ list_t zfs_dbgmsgs; int zfs_dbgmsg_size; kmutex_t zfs_dbgmsgs_lock; -int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ #endif /* @@ -44,7 +44,38 @@ int zfs_flags = 0; * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ -int zfs_recover = 0; +int zfs_recover = B_FALSE; + +/* + * If destroy encounters an EIO while reading metadata (e.g. indirect + * blocks), space referenced by the missing metadata can not be freed. + * Normally this causes the background destroy to become "stalled", as + * it is unable to make forward progress. While in this stalled state, + * all remaining space to free from the error-encountering filesystem is + * "temporarily leaked". Set this flag to cause it to ignore the EIO, + * permanently leak the space from indirect blocks that can not be read, + * and continue to free everything else that it can. + * + * The default, "stalling" behavior is useful if the storage partially + * fails (i.e. some but not all i/os fail), and then later recovers. In + * this case, we will be able to continue pool operations while it is + * partially failed, and when it recovers, we can continue to free the + * space, with no leaks. However, note that this case is actually + * fairly rare. + * + * Typically pools either (a) fail completely (but perhaps temporarily, + * e.g. a top-level vdev going offline), or (b) have localized, + * permanent errors (e.g. disk returns the wrong data due to bit flip or + * firmware bug). In case (a), this setting does not matter because the + * pool will be suspended and the sync thread will not be able to make + * forward progress regardless. In case (b), because the error is + * permanent, the best we can do is leak the minimum amount of space, + * which is what setting this flag will do. Therefore, it is reasonable + * for this flag to normally be set, but we chose the more conservative + * approach of not setting it, so that there is no possibility of + * leaking space in the "partial temporary" failure case. + */ +int zfs_free_leak_on_eio = B_FALSE; void @@ -163,4 +194,8 @@ MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags"); module_param(zfs_recover, int, 0644); MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors"); + +module_param(zfs_free_leak_on_eio, int, 0644); +MODULE_PARM_DESC(zfs_free_leak_on_eio, + "Set to ignore IO errors during free and permanently leak the space"); #endif /* _KERNEL */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ad97ef5db..58d4550fb 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3356,13 +3356,6 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, ASSERT(zb1->zb_objset == zb2->zb_objset); ASSERT(zb2->zb_level == 0); - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE);