Fix i/o error handling of livelists and zap iteration

Pool-wide metadata is stored in the MOS (Meta Object Set). This metadata is stored in triplicate, in addition to any pool-level reduncancy (e.g. RAIDZ). However, if all 3+ copies of this metadata are not available, we can still get EIO/ECKSUM when reading from the MOS. If we encounter such an error in syncing context, we have typically already committed to making a change that we now can't do because of the corrupt/missing metadata. We typically "handle" this with a `VERIFY()` or `zfs_panic_recover()`. This prevents the system from continuing on in an undefined state, while minimizing the amount of error-handling code. However, there are some code paths that ignore these i/o errors, or `ASSERT()` that they don't happen. Since assertions are disabled on non-debug builds, they effectively ignore them as well. This can lead to ZFS continuing on in an incorrect state, potentially leading to on-disk inconsistencies. This commit adds handling for these i/o errors on MOS metadata, typically with a `VERIFY()`: * Handle error return from `zap_cursor_retrieve()` in 4 places in `dsl_deadlist.c`. * Handle error return from `zap_contains()` in `dsl_dir_hold_obj()`. Turns out this call isn't necessary because we can always call `zap_lookup()`. * Handle error return from `zap_lookup()` in `dsl_fs_ss_limit_check()`. * Handle error return from `zap_remove()` in `dsl_dir_rename_sync()`. * Handle error return from `zap_lookup()` in `dsl_dir_remove_livelist()`. * Handle error return from `dsl_process_sub_livelist()` in `spa_livelist_delete_cb()`. Additionally: * Augment the internal history log message for `zfs destroy` to note which method is used (e.g. bptree, livelist, or, synchronous) and the mintxg. * Correct a comment in `dbuf_init()`. * Correct indentation in `dsl_dir_remove_livelist()`. Reviewed by: Sara Hartse <sara.hartse@delphix.com> Reviewed-by: George Wilson <george.wilson@delphix.com> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Closes #10643
2026-05-24 11:18:52 +03:00 · 2020-08-05 10:22:09 -07:00
parent 1b376d176e
commit d87676a9fa
4 changed files with 91 additions and 69 deletions
@@ -197,25 +197,27 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
 		dd->dd_dbuf = dbuf;
 		dd->dd_pool = dp;

-		if (dsl_dir_is_zapified(dd) &&
-		    zap_contains(dp->dp_meta_objset, ddobj,
-		    DD_FIELD_CRYPTO_KEY_OBJ) == 0) {
-			VERIFY0(zap_lookup(dp->dp_meta_objset,
-			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
-			    sizeof (uint64_t), 1, &dd->dd_crypto_obj));
-
-			/* check for on-disk format errata */
-			if (dsl_dir_incompatible_encryption_version(dd)) {
-				dp->dp_spa->spa_errata =
-				    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
-			}
-		}
-
 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
 		dsl_prop_init(dd);

+		if (dsl_dir_is_zapified(dd)) {
+			err = zap_lookup(dp->dp_meta_objset,
+			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
+			    sizeof (uint64_t), 1, &dd->dd_crypto_obj);
+			if (err == 0) {
+				/* check for on-disk format errata */
+				if (dsl_dir_incompatible_encryption_version(
+				    dd)) {
+					dp->dp_spa->spa_errata =
+					    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
+				}
+			} else if (err != ENOENT) {
+				goto errout;
+			}
+		}
+
 		dsl_dir_snap_cmtime_update(dd);

 		if (dsl_dir_phys(dd)->dd_parent_obj) {
@@ -863,9 +865,14 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
 	 * stop since we know there is no limit here (or above). The counts are
 	 * not valid on this node and we know we won't touch this node's counts.
 	 */
-	if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
-	    count_prop, sizeof (count), 1, &count) == ENOENT)
+	if (!dsl_dir_is_zapified(dd))
 		return (0);
+	err = zap_lookup(os, dd->dd_object,
+	    count_prop, sizeof (count), 1, &count);
+	if (err == ENOENT)
+		return (0);
+	if (err != 0)
+		return (err);

 	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
 	    B_FALSE);
@@ -2047,7 +2054,6 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd, *newparent;
 	const char *mynewname;
-	int error;
 	objset_t *mos = dp->dp_meta_objset;

 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
@@ -2114,10 +2120,9 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);

 	/* remove from old parent zapobj */
-	error = zap_remove(mos,
+	VERIFY0(zap_remove(mos,
 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
-	    dd->dd_myname, tx);
-	ASSERT0(error);
+	    dd->dd_myname, tx));

 	(void) strlcpy(dd->dd_myname, mynewname,
 	    sizeof (dd->dd_myname));
@@ -2259,49 +2264,45 @@ dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
 	if (ll_condense_thread != NULL &&
 	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
-			/*
-			 * We use zthr_wait_cycle_done instead of zthr_cancel
-			 * because we don't want to destroy the zthr, just have
-			 * it skip its current task.
-			 */
-			spa->spa_to_condense.cancelled = B_TRUE;
-			zthr_wait_cycle_done(ll_condense_thread);
-			/*
-			 * If we've returned from zthr_wait_cycle_done without
-			 * clearing the to_condense data structure it's either
-			 * because the no-wait synctask has started (which is
-			 * indicated by 'syncing' field of to_condense) and we
-			 * can expect it to clear to_condense on its own.
-			 * Otherwise, we returned before the zthr ran. The
-			 * checkfunc will now fail as cancelled == B_TRUE so we
-			 * can safely NULL out ds, allowing a different dir's
-			 * livelist to be condensed.
-			 *
-			 * We can be sure that the to_condense struct will not
-			 * be repopulated at this stage because both this
-			 * function and dsl_livelist_try_condense execute in
-			 * syncing context.
-			 */
-			if ((spa->spa_to_condense.ds != NULL) &&
-			    !spa->spa_to_condense.syncing) {
-				dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
-				    spa);
-				spa->spa_to_condense.ds = NULL;
-			}
+		/*
+		 * We use zthr_wait_cycle_done instead of zthr_cancel
+		 * because we don't want to destroy the zthr, just have
+		 * it skip its current task.
+		 */
+		spa->spa_to_condense.cancelled = B_TRUE;
+		zthr_wait_cycle_done(ll_condense_thread);
+		/*
+		 * If we've returned from zthr_wait_cycle_done without
+		 * clearing the to_condense data structure it's either
+		 * because the no-wait synctask has started (which is
+		 * indicated by 'syncing' field of to_condense) and we
+		 * can expect it to clear to_condense on its own.
+		 * Otherwise, we returned before the zthr ran. The
+		 * checkfunc will now fail as cancelled == B_TRUE so we
+		 * can safely NULL out ds, allowing a different dir's
+		 * livelist to be condensed.
+		 *
+		 * We can be sure that the to_condense struct will not
+		 * be repopulated at this stage because both this
+		 * function and dsl_livelist_try_condense execute in
+		 * syncing context.
+		 */
+		if ((spa->spa_to_condense.ds != NULL) &&
+		    !spa->spa_to_condense.syncing) {
+			dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
+			    spa);
+			spa->spa_to_condense.ds = NULL;
+		}
 	}

 	dsl_dir_livelist_close(dd);
-	int err = zap_lookup(dp->dp_meta_objset, dd->dd_object,
-	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj);
-	if (err == 0) {
-		VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
-		    DD_FIELD_LIVELIST, tx));
-		if (total) {
-			dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
-			spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
-		}
-	} else {
-		ASSERT3U(err, !=, ENOENT);
+	VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
+	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
+	VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
+	    DD_FIELD_LIVELIST, tx));
+	if (total) {
+		dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
+		spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
 	}
 }