mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-25 11:47:43 +03:00
Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol
4390 i/o errors when deleting filesystem/zvol can lead to space map corruption Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Dan McDonald <danmcd@omniti.com> Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com> Approved by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/4390 https://github.com/illumos/illumos-gate/commit/7fd05ac Porting notes: Previous stack-reduction efforts in traverse_visitb() caused a fair number of un-mergable pieces of code. This patch should reduce its stack footprint a bit more. The new local bptree_entry_phys_t in bptree_add() is dynamically-allocated using kmem_zalloc() for the purpose of stack reduction. The new global zfs_free_leak_on_eio has been defined as an integer rather than a boolean_t as was the case with the related zfs_recover global. Also, zfs_free_leak_on_eio's definition has been inserted into zfs_debug.c for consistency with the existing definition of zfs_recover. Illumos placed it in spa_misc.c. Ported by: Tim Chase <tim@chase2k.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #2545
This commit is contained in:
committed by
Brian Behlendorf
parent
9b67f60560
commit
fbeddd60b7
+95
-47
@@ -65,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
|
||||
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
|
||||
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
|
||||
int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
|
||||
int zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
|
||||
int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
|
||||
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
|
||||
int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
|
||||
|
||||
@@ -1417,7 +1417,7 @@ dsl_scan_active(dsl_scan_t *scn)
|
||||
if (spa_shutting_down(spa))
|
||||
return (B_FALSE);
|
||||
if (scn->scn_phys.scn_state == DSS_SCANNING ||
|
||||
scn->scn_async_destroying)
|
||||
(scn->scn_async_destroying && !scn->scn_async_stalled))
|
||||
return (B_TRUE);
|
||||
|
||||
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
|
||||
@@ -1432,7 +1432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_scan_t *scn = dp->dp_scan;
|
||||
spa_t *spa = dp->dp_spa;
|
||||
int err;
|
||||
int err = 0;
|
||||
|
||||
/*
|
||||
* Check for scn_restart_txg before checking spa_load_state, so
|
||||
@@ -1450,7 +1450,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
dsl_scan_setup_sync(&func, tx);
|
||||
}
|
||||
|
||||
if (!dsl_scan_active(scn) ||
|
||||
/*
|
||||
* If the scan is inactive due to a stalled async destroy, try again.
|
||||
*/
|
||||
if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
|
||||
spa_sync_pass(dp->dp_spa) > 1)
|
||||
return;
|
||||
|
||||
@@ -1460,10 +1463,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
spa->spa_scrub_active = B_TRUE;
|
||||
|
||||
/*
|
||||
* First process the free list. If we pause the free, don't do
|
||||
* any scanning. This ensures that there is no free list when
|
||||
* we are scanning, so the scan code doesn't have to worry about
|
||||
* traversing it.
|
||||
* First process the async destroys. If we pause, don't do
|
||||
* any scrubbing or resilvering. This ensures that there are no
|
||||
* async destroys while we are scanning, so the scan code doesn't
|
||||
* have to worry about traversing it. It is also faster to free the
|
||||
* blocks than to scrub them.
|
||||
*/
|
||||
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
|
||||
scn->scn_is_bptree = B_FALSE;
|
||||
@@ -1473,48 +1477,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
dsl_scan_free_block_cb, scn, tx);
|
||||
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
|
||||
|
||||
if (err == 0 && spa_feature_is_active(spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
ASSERT(scn->scn_async_destroying);
|
||||
scn->scn_is_bptree = B_TRUE;
|
||||
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
|
||||
NULL, ZIO_FLAG_MUSTSUCCEED);
|
||||
err = bptree_iterate(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
|
||||
scn, tx);
|
||||
VERIFY0(zio_wait(scn->scn_zio_root));
|
||||
if (err != 0 && err != ERESTART)
|
||||
zfs_panic_recover("error %u from bpobj_iterate()", err);
|
||||
}
|
||||
|
||||
if (err == 0) {
|
||||
/* finished; deactivate async destroy feature */
|
||||
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY,
|
||||
tx);
|
||||
ASSERT(!spa_feature_is_active(spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY));
|
||||
VERIFY0(zap_remove(dp->dp_meta_objset,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, tx));
|
||||
VERIFY0(bptree_free(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, tx));
|
||||
dp->dp_bptree_obj = 0;
|
||||
scn->scn_async_destroying = B_FALSE;
|
||||
}
|
||||
if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
|
||||
ASSERT(scn->scn_async_destroying);
|
||||
scn->scn_is_bptree = B_TRUE;
|
||||
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
|
||||
NULL, ZIO_FLAG_MUSTSUCCEED);
|
||||
err = bptree_iterate(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
|
||||
VERIFY0(zio_wait(scn->scn_zio_root));
|
||||
|
||||
if (err == EIO || err == ECKSUM) {
|
||||
err = 0;
|
||||
} else if (err != 0 && err != ERESTART) {
|
||||
zfs_panic_recover("error %u from "
|
||||
"traverse_dataset_destroyed()", err);
|
||||
}
|
||||
if (scn->scn_visited_this_txg) {
|
||||
zfs_dbgmsg("freed %llu blocks in %llums from "
|
||||
"free_bpobj/bptree txg %llu",
|
||||
(longlong_t)scn->scn_visited_this_txg,
|
||||
(longlong_t)
|
||||
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
|
||||
(longlong_t)tx->tx_txg);
|
||||
scn->scn_visited_this_txg = 0;
|
||||
/*
|
||||
* Re-sync the ddt so that we can further modify
|
||||
* it when doing bprewrite.
|
||||
*/
|
||||
ddt_sync(spa, tx->tx_txg);
|
||||
|
||||
/*
|
||||
* If we didn't make progress, mark the async destroy as
|
||||
* stalled, so that we will not initiate a spa_sync() on
|
||||
* its behalf.
|
||||
*/
|
||||
scn->scn_async_stalled = (scn->scn_visited_this_txg == 0);
|
||||
|
||||
if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
|
||||
/* finished; deactivate async destroy feature */
|
||||
spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
|
||||
ASSERT(!spa_feature_is_active(spa,
|
||||
SPA_FEATURE_ASYNC_DESTROY));
|
||||
VERIFY0(zap_remove(dp->dp_meta_objset,
|
||||
DMU_POOL_DIRECTORY_OBJECT,
|
||||
DMU_POOL_BPTREE_OBJ, tx));
|
||||
VERIFY0(bptree_free(dp->dp_meta_objset,
|
||||
dp->dp_bptree_obj, tx));
|
||||
dp->dp_bptree_obj = 0;
|
||||
scn->scn_async_destroying = B_FALSE;
|
||||
}
|
||||
if (err == ERESTART)
|
||||
return;
|
||||
}
|
||||
if (scn->scn_visited_this_txg) {
|
||||
zfs_dbgmsg("freed %llu blocks in %llums from "
|
||||
"free_bpobj/bptree txg %llu; err=%u",
|
||||
(longlong_t)scn->scn_visited_this_txg,
|
||||
(longlong_t)
|
||||
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
|
||||
(longlong_t)tx->tx_txg, err);
|
||||
scn->scn_visited_this_txg = 0;
|
||||
|
||||
/*
|
||||
* Write out changes to the DDT that may be required as a
|
||||
* result of the blocks freed. This ensures that the DDT
|
||||
* is clean when a scrub/resilver runs.
|
||||
*/
|
||||
ddt_sync(spa, tx->tx_txg);
|
||||
}
|
||||
if (err != 0)
|
||||
return;
|
||||
if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
|
||||
(dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
|
||||
dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
|
||||
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
|
||||
/*
|
||||
* We have finished background destroying, but there is still
|
||||
* some space left in the dp_free_dir. Transfer this leaked
|
||||
* space to the dp_leak_dir.
|
||||
*/
|
||||
if (dp->dp_leak_dir == NULL) {
|
||||
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
|
||||
(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
|
||||
LEAK_DIR_NAME, tx);
|
||||
VERIFY0(dsl_pool_open_special_dir(dp,
|
||||
LEAK_DIR_NAME, &dp->dp_leak_dir));
|
||||
rrw_exit(&dp->dp_config_rwlock, FTAG);
|
||||
}
|
||||
dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
|
||||
dp->dp_free_dir->dd_phys->dd_used_bytes,
|
||||
dp->dp_free_dir->dd_phys->dd_compressed_bytes,
|
||||
dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
|
||||
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
|
||||
-dp->dp_free_dir->dd_phys->dd_used_bytes,
|
||||
-dp->dp_free_dir->dd_phys->dd_compressed_bytes,
|
||||
-dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
|
||||
}
|
||||
if (!scn->scn_async_destroying) {
|
||||
/* finished; verify that space accounting went to zero */
|
||||
ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
|
||||
ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
|
||||
|
||||
Reference in New Issue
Block a user