From 4653e2f7d3b00714fd11ff267f7263097eec0718 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 28 Jun 2024 21:07:41 +1000 Subject: [PATCH] dmu_tx: break tx assign/wait when pool suspends This adjusts dmu_tx_assign/dmu_tx_wait to be interruptable if the pool suspends while they're waiting, rather than just on the initial check before falling back into a wait. Since that's not always wanted, add a DMU_TX_SUSPEND flag to ignore suspend entirely, effectively returning to the previous behaviour. With that, it shouldn't be possible for anything with a standard dmu_tx_assign/wait/abort loop to block under failmode=continue. Also should be a bit tighter than the old behaviour, where a VERIFY0(dmu_tx_assign(DMU_TX_WAIT)) could technically fail if the pool is already suspended and failmode=continue. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Paul Dagnelie Signed-off-by: Rob Norris Closes #17355 --- include/sys/dmu.h | 10 ++++- include/sys/dmu_tx.h | 4 ++ module/zfs/dmu_tx.c | 105 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 99 insertions(+), 20 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 324728a2d..bc2aea71e 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -291,13 +291,19 @@ typedef enum { /* * Assign the tx to the open transaction. If the open transaction is * full, or the write throttle is active, block until the next - * transaction and try again. If the pool suspends while waiting, - * return an error. + * transaction and try again. If the pool suspends while waiting + * and failmode=continue, return an error. */ DMU_TX_WAIT = (1 << 0), /* If the write throttle would prevent the assignment, ignore it. */ DMU_TX_NOTHROTTLE = (1 << 1), + + /* + * With DMU_TX_WAIT, always block if the pool suspends during + * assignment, regardless of the value of the failmode= property. + */ + DMU_TX_SUSPEND = (1 << 2), } dmu_tx_flag_t; void byteswap_uint64_array(void *buf, size_t size); diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 3b0aa19c4..ce49a0c49 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -25,6 +25,7 @@ */ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #ifndef _SYS_DMU_TX_H @@ -80,6 +81,9 @@ struct dmu_tx { /* has this transaction already been delayed? */ boolean_t tx_dirty_delayed; + /* whether dmu_tx_wait() should return on suspend */ + boolean_t tx_break_on_suspend; + int tx_err; }; diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 316501371..c2e6c749f 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ #include @@ -1017,7 +1017,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) * decreasing performance. */ static int -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t flags) +dmu_tx_try_assign(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; @@ -1032,19 +1032,10 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t flags) DMU_TX_STAT_BUMP(dmu_tx_suspended); /* - * If the user has indicated a blocking failure mode - * then return ERESTART which will block in dmu_tx_wait(). - * Otherwise, return EIO so that an error can get - * propagated back to the VOP calls. - * - * Note that we always honor the `flags` flag regardless - * of the failuremode setting. + * Let dmu_tx_assign() know specifically what happened, so + * it can make the right choice based on the caller flags. */ - if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && - !(flags & DMU_TX_WAIT)) - return (SET_ERROR(EIO)); - - return (SET_ERROR(ERESTART)); + return (SET_ERROR(ESHUTDOWN)); } if (!tx->tx_dirty_delayed && @@ -1184,6 +1175,12 @@ dmu_tx_unassign(dmu_tx_t *tx) * they have already called dmu_tx_wait() (though most likely on a * different tx). * + * If DMU_TX_SUSPEND is set, this indicates that this tx should ignore + * the pool being or becoming suspending while it is in progress. This will + * cause dmu_tx_assign() (and dmu_tx_wait()) to block until the pool resumes. + * If this flag is not set and the pool suspends, the return will be either + * ERESTART or EIO, depending on the value of the pool's failmode= property. + * * It is guaranteed that subsequent successful calls to dmu_tx_assign() * will assign the tx to monotonically increasing txgs. Of course this is * not strong monotonicity, because the same txg can be returned multiple @@ -1206,7 +1203,8 @@ dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags) int err; ASSERT(tx->tx_txg == 0); - ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE)); + ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND)); + IMPLY(flags & DMU_TX_SUSPEND, flags & DMU_TX_WAIT); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ @@ -1215,13 +1213,74 @@ dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags) if ((flags & DMU_TX_NOTHROTTLE)) tx->tx_dirty_delayed = B_TRUE; - while ((err = dmu_tx_try_assign(tx, flags)) != 0) { + if (!(flags & DMU_TX_SUSPEND)) + tx->tx_break_on_suspend = B_TRUE; + + while ((err = dmu_tx_try_assign(tx)) != 0) { dmu_tx_unassign(tx); + boolean_t suspended = (err == ESHUTDOWN); + if (suspended) { + /* + * Pool suspended. We need to decide whether to block + * and retry, or return error, depending on the + * caller's flags and the pool config. + */ + if (flags & DMU_TX_SUSPEND) + /* + * The caller expressly does not care about + * suspend, so treat it as a normal retry. + */ + err = SET_ERROR(ERESTART); + else if ((flags & DMU_TX_WAIT) && + spa_get_failmode(tx->tx_pool->dp_spa) == + ZIO_FAILURE_MODE_CONTINUE) + /* + * Caller wants to wait, but pool config is + * overriding that, so return EIO to be + * propagated back to userspace. + */ + err = SET_ERROR(EIO); + else + /* Anything else, we should just block. */ + err = SET_ERROR(ERESTART); + } + + /* + * Return unless we decided to retry, or the caller does not + * want to block. + */ if (err != ERESTART || !(flags & DMU_TX_WAIT)) return (err); + /* + * Wait until there's room in this txg, or until it's been + * synced out and a new one is available. + * + * If we're here because the pool suspended above, then we + * unset tx_break_on_suspend to make sure that if dmu_tx_wait() + * has to fall back to a txg_wait_synced_flags(), it doesn't + * immediately return because the pool is suspended. That would + * then immediately return here, and we'd end up in a busy loop + * until the pool resumes. + * + * On the other hand, if the pool hasn't suspended yet, then it + * should be allowed to break a txg wait if the pool does + * suspend, so we can loop and reassess it in + * dmu_tx_try_assign(). + */ + if (suspended) + tx->tx_break_on_suspend = B_FALSE; + dmu_tx_wait(tx); + + /* + * Reset tx_break_on_suspend for DMU_TX_SUSPEND. We do this + * here so that it's available if we return for some other + * reason, and then the caller calls dmu_tx_wait(). + */ + if (!(flags & DMU_TX_SUSPEND)) + tx->tx_break_on_suspend = B_TRUE; } txg_rele_to_quiesce(&tx->tx_txgh); @@ -1239,6 +1298,16 @@ dmu_tx_wait(dmu_tx_t *tx) ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); + /* + * Break on suspend according to whether or not DMU_TX_SUSPEND was + * supplied to the previous dmu_tx_assign() call. For clients, this + * ensures that after dmu_tx_assign() fails, the followup dmu_tx_wait() + * gets the same behaviour wrt suspend. See also the comments in + * dmu_tx_assign(). + */ + txg_wait_flag_t flags = + (tx->tx_break_on_suspend ? TXG_WAIT_SUSPEND : TXG_WAIT_NONE); + before = gethrtime(); if (tx->tx_wait_dirty) { @@ -1276,7 +1345,7 @@ dmu_tx_wait(dmu_tx_t *tx) * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags); } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; @@ -1291,7 +1360,7 @@ dmu_tx_wait(dmu_tx_t *tx) * out a TXG at which point we'll hopefully have synced * a portion of the changes. */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags); } spa_tx_assign_add_nsecs(spa, gethrtime() - before);