From 7ddc1f737f618a2539e06c146f9f23fa37412a91 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 26 Jul 2024 09:53:59 +1000 Subject: [PATCH] zil: add stats for commit failure/fallback (#16315) There's no good way to tell when a ZIL commit fails and falls back to a transaction sync, other than perhaps a throughput drop. This adds counters so we can see when it happens and why. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- cmd/zilstat.in | 3 +++ include/sys/zil.h | 18 ++++++++++++++++++ module/zfs/zil.c | 19 +++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/cmd/zilstat.in b/cmd/zilstat.in index e8678e20c..6be7f8393 100755 --- a/cmd/zilstat.in +++ b/cmd/zilstat.in @@ -43,6 +43,9 @@ cols = { "obj": [12, -1, "objset"], "cc": [5, 1000, "zil_commit_count"], "cwc": [5, 1000, "zil_commit_writer_count"], + "cec": [5, 1000, "zil_commit_error_count"], + "csc": [5, 1000, "zil_commit_stall_count"], + "cSc": [5, 1000, "zil_commit_suspend_count"], "ic": [5, 1000, "zil_itx_count"], "iic": [5, 1000, "zil_itx_indirect_count"], "iib": [5, 1024, "zil_itx_indirect_bytes"], diff --git a/include/sys/zil.h b/include/sys/zil.h index 4747ecc06..384678b22 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -467,6 +467,21 @@ typedef struct zil_stats { */ kstat_named_t zil_commit_writer_count; + /* + * Number of times a ZIL commit failed and the ZIL was forced to fall + * back to txg_wait_synced(). The separate counts are for different + * reasons: + * - error: ZIL IO (write/flush) returned an error + * (see zil_commit_impl()) + * - stall: LWB block allocation failed, ZIL chain abandoned + * (see zil_commit_writer_stall()) + * - suspend: ZIL suspended + * (see zil_commit(), zil_get_commit_list()) + */ + kstat_named_t zil_commit_error_count; + kstat_named_t zil_commit_stall_count; + kstat_named_t zil_commit_suspend_count; + /* * Number of transactions (reads, writes, renames, etc.) * that have been committed. @@ -510,6 +525,9 @@ typedef struct zil_stats { typedef struct zil_sums { wmsum_t zil_commit_count; wmsum_t zil_commit_writer_count; + wmsum_t zil_commit_error_count; + wmsum_t zil_commit_stall_count; + wmsum_t zil_commit_suspend_count; wmsum_t zil_itx_count; wmsum_t zil_itx_indirect_count; wmsum_t zil_itx_indirect_bytes; diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 34be54b33..12a5e952b 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -99,6 +99,9 @@ static uint_t zfs_commit_timeout_pct = 10; static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, + { "zil_commit_error_count", KSTAT_DATA_UINT64 }, + { "zil_commit_stall_count", KSTAT_DATA_UINT64 }, + { "zil_commit_suspend_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, @@ -360,6 +363,9 @@ zil_sums_init(zil_sums_t *zs) { wmsum_init(&zs->zil_commit_count, 0); wmsum_init(&zs->zil_commit_writer_count, 0); + wmsum_init(&zs->zil_commit_error_count, 0); + wmsum_init(&zs->zil_commit_stall_count, 0); + wmsum_init(&zs->zil_commit_suspend_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); @@ -382,6 +388,9 @@ zil_sums_fini(zil_sums_t *zs) { wmsum_fini(&zs->zil_commit_count); wmsum_fini(&zs->zil_commit_writer_count); + wmsum_fini(&zs->zil_commit_error_count); + wmsum_fini(&zs->zil_commit_stall_count); + wmsum_fini(&zs->zil_commit_suspend_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); @@ -406,6 +415,12 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) wmsum_value(&zil_sums->zil_commit_count); zs->zil_commit_writer_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_writer_count); + zs->zil_commit_error_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_error_count); + zs->zil_commit_stall_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_stall_count); + zs->zil_commit_suspend_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_suspend_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = @@ -2823,6 +2838,7 @@ zil_commit_writer_stall(zilog_t *zilog) * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ZIL_STAT_BUMP(zilog, zil_commit_stall_count); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT(list_is_empty(&zilog->zl_lwb_list)); } @@ -3592,6 +3608,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { + ZIL_STAT_BUMP(zilog, zil_commit_suspend_count); txg_wait_synced(zilog->zl_dmu_pool, 0); return; } @@ -3645,10 +3662,12 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ + ZIL_STAT_BUMP(zilog, zil_commit_error_count); DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } else if (wtxg != 0) { + ZIL_STAT_BUMP(zilog, zil_commit_suspend_count); txg_wait_synced(zilog->zl_dmu_pool, wtxg); }