Linux: zfs_putpage: handle page writeback errors

Page writeback is considered completed when the associated itx callback completes. A syncing writeback will receive the error in its callback directly, but an in-flight async writeback that was promoted to sync by the ZIL may also receive an error. Writeback errors, even syncing writeback errors, are not especially serious on their own, because the error will ultimately be returned to the zil_commit() caller, either zfs_fsync() for an explicit sync op (eg msync()) or to zfs_putpage() itself for a syncing (WB_SYNC_ALL) writeback (kernel housekeeping or sync_file_range(SYNC_FILE_RANGE_WAIT_AFTER). The only thing we need to do when a page writeback fails is to re-mark the page dirty, since we don't know if it made it to disk yet. This will ensure that it gets written out again in the future, either some scheduled async writeback or another explicit syncing call. On the other side, we need to make sure that if a syncing op arrives, any changes on dirty pages are written back to the DMU and/or the ZIL first. We do this by starting an _async_ (WB_SYNC_NONE) writeback on the file mapping at the start of the sync op (fsync(), msync(), etc). An async op will get an async itx created and logged, ready for the followup zfs_fsync()->zil_commit() to find, while avoiding a zil_commit() call for every page in the range. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes #17398
2026-03-14 06:16:17 +03:00 · 2025-06-04 19:20:24 +10:00 · 2025-06-04 19:20:24 +10:00 · 3d6ee9a68c
commit 3d6ee9a68c
parent 391e85f519
2 changed files with 95 additions and 25 deletions
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@ -3694,16 +3694,41 @@ top:
 	return (error);
 }

-static void
-zfs_putpage_commit_cb(void *arg)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err)
 {
-	(void) err;
-	struct page *pp = arg;
+	if (err != 0) {
+		/*
+		 * Writeback failed. Re-dirty the page. It was undirtied before
+		 * the IO was issued (in zfs_putpage() or write_cache_pages()).
+		 * The kernel only considers writeback for dirty pages; if we
+		 * don't do this, it is eligible for eviction without being
+		 * written out, which we definitely don't want.
+		 */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+		__set_page_dirty_nobuffers(pp);
+#endif
+	}

 	ClearPageError(pp);
 	end_page_writeback(pp);
 }

+/*
+ * ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage()
+ * for syncing writes. Called when the ZIL itx has been written to the log or
+ * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
+ * is passed as `err`.
+ */
+static void
+zfs_putpage_commit_cb(void *arg, int err)
+{
+	zfs_page_writeback_done(arg, err);
+}
+
 /*
 * Push a page out to disk, once the page is on stable storage the
 * registered commit callback will be run as notification of completion.
@ -3857,16 +3882,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	err = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
-		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
-		__set_page_dirty_nobuffers(pp);
-#endif
-		ClearPageError(pp);
-		end_page_writeback(pp);
+		zfs_page_writeback_done(pp, err);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
-		return (err);
+
+		/*
+		 * Don't return error for an async writeback; we've re-dirtied
+		 * the page so it will be tried again some other time.
+		 */
+		return (for_sync ? err : 0);
 	}

 	va = kmap(pp);
@ -3920,7 +3944,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	 * ALL, zfs_putpage should do it.
 	 *
 	 * Summary:
-	 *   for_sync:  0=unlock immediately; 1 unlock once on disk
+	 *   for_sync:  0=unlock immediately; 1=unlock once on disk
 	 *   sync_mode: NONE=caller will commit; ALL=we will commit
 	 */
 	boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
@ -3935,8 +3959,11 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);

 	if (!for_sync) {
-		ClearPageError(pp);
-		end_page_writeback(pp);
+		/*
+		 * Async writeback is logged and written to the DMU, so page
+		 * can now be unlocked.
+		 */
+		zfs_page_writeback_done(pp, 0);
 	}

 	dmu_tx_commit(tx);
@ -3944,7 +3971,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	zfs_rangelock_exit(lr);

 	if (need_commit) {
-		err = zil_commit(zfsvfs->z_log, zp->z_id);
+		err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
 		if (err != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (err);
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@ -107,6 +107,10 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
 	return (error);
 }

+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data);
+
 static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
@ -116,9 +120,38 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	int error;
 	fstrans_cookie_t cookie;

-	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (error)
-		return (error);
+	/*
+	 * Force dirty pages in the range out to the DMU and the log, ready
+	 * for zil_commit() to write down.
+	 *
+	 * We call write_cache_pages() directly to ensure that zpl_putpage() is
+	 * called with the flags we need. We need WB_SYNC_NONE to avoid a call
+	 * to zil_commit() (since we're doing this as a kind of pre-sync); but
+	 * we do need for_sync so that the pages remain in writeback until
+	 * they're on disk, and so that we get an error if the DMU write fails.
+	 */
+	if (filemap_range_has_page(inode->i_mapping, start, end)) {
+		int for_sync = 1;
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_NONE,
+			.nr_to_write = LONG_MAX,
+			.range_start = start,
+			.range_end = end,
+		};
+		error =
+		    zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
+		if (error != 0) {
+			/*
+			 * Unclear what state things are in. zfs_putpage() will
+			 * ensure the pages remain dirty if they haven't been
+			 * written down to the DMU, but because there may be
+			 * nothing logged, we can't assume that zfs_sync() ->
+			 * zil_commit() will give us a useful error. It's
+			 * safest if we just error out here.
+			 */
+			return (error);
+		}
+	}

 	crhold(cr);
 	cookie = spl_fstrans_mark();
@ -495,15 +528,25 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	if (sync_mode != wbc->sync_mode) {
 		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 			return (result);
-		if (zfsvfs->z_log != NULL)
-			result = -zil_commit(zfsvfs->z_log, zp->z_id);
+
+		if (zfsvfs->z_log != NULL) {
+			/*
+			 * We don't want to block here if the pool suspends,
+			 * because this is not a syncing op by itself, but
+			 * might be part of one that the caller will
+			 * coordinate.
+			 */
+			result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
+			    ZIL_COMMIT_NOW);
+		}
+
 		zpl_exit(zfsvfs, FTAG);

 		/*
-		 * If zil_commit() failed, it's unclear what state things
-		 * are currently in. putpage() has written back out what
-		 * it can to the DMU, but it may not be on disk. We have
-		 * little choice but to escape.
+		 * If zil_commit_flags() failed, it's unclear what state things
+		 * are currently in. putpage() has written back out what it can
+		 * to the DMU, but it may not be on disk. We have little choice
+		 * but to escape.
 		 */
 		if (result != 0)
 			return (result);