ZIL: flag crashed LWBs so we know not to process them

If the ZIL crashed, any outstanding LWBs are no longer interesting, so if they return, we need to just clean them up and return, not try to do any work on them. This is true even if they return success, as that may be long after the pool suspended and resumed, depending on when/if the kernel decides to return the IO to us.

In particular, we must not try to get the "next" LWB from zl_lwb_list, since they're no longer on that list.

So, we put a flag on in-flight LWBs in zil_crash() when we move them from zl_lwb_list to zl_lwb_crash_list, so we know what's going on when they return.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17622
2026-01-25 10:12:13 +03:00 · 2025-08-12 12:17:33 +10:00 · 2025-08-12 12:17:33 +10:00 · 92da3e18c8
commit 92da3e18c8
parent 508c546975
2 changed files with 19 additions and 7 deletions
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -101,8 +101,9 @@ typedef enum {
 * "zl_lock" is used to protect the lwb against concurrent access.
 */
 typedef enum {
-	LWB_FLAG_SLIM =	(1<<0),		/* log block has slim format */
-	LWB_FLAG_SLOG =	(1<<1),		/* lwb_blk is on SLOG device */
+	LWB_FLAG_SLIM =		(1<<0),	/* log block has slim format */
+	LWB_FLAG_SLOG =		(1<<1),	/* lwb_blk is on SLOG device */
+	LWB_FLAG_CRASHED =	(1<<2),	/* lwb is on the crash list */
 } lwb_flag_t;

 typedef struct lwb {
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1591,7 +1591,7 @@ zil_lwb_write_done(zio_t *zio)
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
-	lwb_t *nlwb;
+	lwb_t *nlwb = NULL;

 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);

@@ -1611,9 +1611,11 @@ zil_lwb_write_done(zio_t *zio)
 	 * its write ZIO a parent this ZIO.  In such case we can not defer
 	 * our flushes or below may be a race between the done callbacks.
 	 */
-	nlwb = list_next(&zilog->zl_lwb_list, lwb);
-	if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
-		nlwb = NULL;
+	if (!(lwb->lwb_flags & LWB_FLAG_CRASHED)) {
+		nlwb = list_next(&zilog->zl_lwb_list, lwb);
+		if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
+			nlwb = NULL;
+	}
 	mutex_exit(&zilog->zl_lock);

 	if (avl_numnodes(t) == 0)
@@ -1631,8 +1633,13 @@ zil_lwb_write_done(zio_t *zio)
 	 * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
 	 * we expect any error seen here, to have been propagated to
 	 * that function).
+	 *
+	 * Note that we treat a "crashed" LWB as though it was in error,
+	 * even if it did appear to succeed, because we've already
+	 * signaled error and cleaned up waiters and committers in
+	 * zil_crash(); we just want to clean up and get out of here.
 	 */
-	if (zio->io_error != 0) {
+	if (zio->io_error != 0 || (lwb->lwb_flags & LWB_FLAG_CRASHED)) {
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
 			kmem_free(zv, sizeof (*zv));
 		return;
@@ -2747,6 +2754,7 @@ zil_crash_clean(zilog_t *zilog, uint64_t synced_txg)
 		}

 		/* This LWB is from the past, so we can clean it up now. */
+		ASSERT(lwb->lwb_flags & LWB_FLAG_CRASHED);
 		list_remove(&zilog->zl_lwb_crash_list, lwb);
 		if (lwb->lwb_buf != NULL)
 			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
@@ -3733,6 +3741,9 @@ zil_crash(zilog_t *zilog)
 	 */
 	for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL;
 	    lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) {
+		ASSERT(!(lwb->lwb_flags & LWB_FLAG_CRASHED));
+		lwb->lwb_flags |= LWB_FLAG_CRASHED;
+
 		itx_t *itx;
 		while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 			zil_itx_destroy(itx, EIO);