Linux: zfs_putpage: handle page writeback errors

Page writeback is considered completed when the associated itx callback
completes. A syncing writeback will receive the error in its callback
directly, but an in-flight async writeback that was promoted to sync by
the ZIL may also receive an error.

Writeback errors, even syncing writeback errors, are not especially
serious on their own, because the error will ultimately be returned to
the zil_commit() caller, either zfs_fsync() for an explicit sync op (eg
msync()) or to zfs_putpage() itself for a syncing (WB_SYNC_ALL) writeback
(kernel housekeeping or sync_file_range(SYNC_FILE_RANGE_WAIT_AFTER).

The only thing we need to do when a page writeback fails is to re-mark
the page dirty, since we don't know if it made it to disk yet. This will
ensure that it gets written out again in the future, either some
scheduled async writeback or another explicit syncing call.

On the other side, we need to make sure that if a syncing op arrives,
any changes on dirty pages are written back to the DMU and/or the ZIL
first. We do this by starting an _async_ (WB_SYNC_NONE) writeback on the
file mapping at the start of the sync op (fsync(), msync(), etc). An
async op will get an async itx created and logged, ready for the
followup zfs_fsync()->zil_commit() to find, while avoiding a zil_commit()
call for every page in the range.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #17398
This commit is contained in:
Rob Norris 2025-06-04 19:20:24 +10:00 committed by Brian Behlendorf
parent 391e85f519
commit 3d6ee9a68c
2 changed files with 95 additions and 25 deletions

View File

@ -3694,16 +3694,41 @@ top:
return (error);
}
static void
zfs_putpage_commit_cb(void *arg)
/* Finish page writeback. */
static inline void
zfs_page_writeback_done(struct page *pp, int err)
{
(void) err;
struct page *pp = arg;
if (err != 0) {
/*
* Writeback failed. Re-dirty the page. It was undirtied before
* the IO was issued (in zfs_putpage() or write_cache_pages()).
* The kernel only considers writeback for dirty pages; if we
* don't do this, it is eligible for eviction without being
* written out, which we definitely don't want.
*/
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
#else
__set_page_dirty_nobuffers(pp);
#endif
}
ClearPageError(pp);
end_page_writeback(pp);
}
/*
* ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage()
* for syncing writes. Called when the ZIL itx has been written to the log or
* the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
* is passed as `err`.
*/
static void
zfs_putpage_commit_cb(void *arg, int err)
{
zfs_page_writeback_done(arg, err);
}
/*
* Push a page out to disk, once the page is on stable storage the
* registered commit callback will be run as notification of completion.
@ -3857,16 +3882,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
err = dmu_tx_assign(tx, DMU_TX_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
#else
__set_page_dirty_nobuffers(pp);
#endif
ClearPageError(pp);
end_page_writeback(pp);
zfs_page_writeback_done(pp, err);
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
return (err);
/*
* Don't return error for an async writeback; we've re-dirtied
* the page so it will be tried again some other time.
*/
return (for_sync ? err : 0);
}
va = kmap(pp);
@ -3920,7 +3944,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
* ALL, zfs_putpage should do it.
*
* Summary:
* for_sync: 0=unlock immediately; 1 unlock once on disk
* for_sync: 0=unlock immediately; 1=unlock once on disk
* sync_mode: NONE=caller will commit; ALL=we will commit
*/
boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
@ -3935,8 +3959,11 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
if (!for_sync) {
ClearPageError(pp);
end_page_writeback(pp);
/*
* Async writeback is logged and written to the DMU, so page
* can now be unlocked.
*/
zfs_page_writeback_done(pp, 0);
}
dmu_tx_commit(tx);
@ -3944,7 +3971,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
zfs_rangelock_exit(lr);
if (need_commit) {
err = zil_commit(zfsvfs->z_log, zp->z_id);
err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
if (err != 0) {
zfs_exit(zfsvfs, FTAG);
return (err);

View File

@ -107,6 +107,10 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
return (error);
}
static inline int
zpl_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, void *data);
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
@ -116,9 +120,38 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
int error;
fstrans_cookie_t cookie;
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (error)
return (error);
/*
* Force dirty pages in the range out to the DMU and the log, ready
* for zil_commit() to write down.
*
* We call write_cache_pages() directly to ensure that zpl_putpage() is
* called with the flags we need. We need WB_SYNC_NONE to avoid a call
* to zil_commit() (since we're doing this as a kind of pre-sync); but
* we do need for_sync so that the pages remain in writeback until
* they're on disk, and so that we get an error if the DMU write fails.
*/
if (filemap_range_has_page(inode->i_mapping, start, end)) {
int for_sync = 1;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
error =
zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
if (error != 0) {
/*
* Unclear what state things are in. zfs_putpage() will
* ensure the pages remain dirty if they haven't been
* written down to the DMU, but because there may be
* nothing logged, we can't assume that zfs_sync() ->
* zil_commit() will give us a useful error. It's
* safest if we just error out here.
*/
return (error);
}
}
crhold(cr);
cookie = spl_fstrans_mark();
@ -495,15 +528,25 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
if (sync_mode != wbc->sync_mode) {
if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (result);
if (zfsvfs->z_log != NULL)
result = -zil_commit(zfsvfs->z_log, zp->z_id);
if (zfsvfs->z_log != NULL) {
/*
* We don't want to block here if the pool suspends,
* because this is not a syncing op by itself, but
* might be part of one that the caller will
* coordinate.
*/
result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
ZIL_COMMIT_NOW);
}
zpl_exit(zfsvfs, FTAG);
/*
* If zil_commit() failed, it's unclear what state things
* are currently in. putpage() has written back out what
* it can to the DMU, but it may not be on disk. We have
* little choice but to escape.
* If zil_commit_flags() failed, it's unclear what state things
* are currently in. putpage() has written back out what it can
* to the DMU, but it may not be on disk. We have little choice
* but to escape.
*/
if (result != 0)
return (result);