diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index c091b2181..610672665 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3694,16 +3694,41 @@ top:
 	return (error);
 }
 
-static void
-zfs_putpage_commit_cb(void *arg, int err)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err)
 {
-	(void) err;
-	struct page *pp = arg;
+	if (err != 0) {
+		/*
+		 * Writeback failed. Re-dirty the page. It was undirtied before
+		 * the IO was issued (in zfs_putpage() or write_cache_pages()).
+		 * The kernel only considers writeback for dirty pages; if we
+		 * don't do this, it is eligible for eviction without being
+		 * written out, which we definitely don't want.
+		 */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+		__set_page_dirty_nobuffers(pp);
+#endif
+	}
 
 	ClearPageError(pp);
 	end_page_writeback(pp);
 }
 
+/*
+ * ZIL callback for page writeback. Passed to zfs_log_write() in zfs_putpage()
+ * for syncing writes. Called when the ZIL itx has been written to the log or
+ * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure
+ * is passed as `err`.
+ */
+static void
+zfs_putpage_commit_cb(void *arg, int err)
+{
+	zfs_page_writeback_done(arg, err);
+}
+
 /*
  * Push a page out to disk, once the page is on stable storage the
  * registered commit callback will be run as notification of completion.
@@ -3857,16 +3882,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	err = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
-		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
-		__set_page_dirty_nobuffers(pp);
-#endif
-		ClearPageError(pp);
-		end_page_writeback(pp);
+		zfs_page_writeback_done(pp, err);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
-		return (err);
+
+		/*
+		 * Don't return error for an async writeback; we've re-dirtied
+		 * the page so it will be tried again some other time.
+		 */
+		return (for_sync ? err : 0);
 	}
 
 	va = kmap(pp);
@@ -3920,7 +3944,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	 * ALL, zfs_putpage should do it.
 	 *
 	 * Summary:
-	 *   for_sync:  0=unlock immediately; 1 unlock once on disk
+	 *   for_sync:  0=unlock immediately; 1=unlock once on disk
 	 *   sync_mode: NONE=caller will commit; ALL=we will commit
 	 */
 	boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE);
@@ -3935,8 +3959,11 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	    B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp);
 
 	if (!for_sync) {
-		ClearPageError(pp);
-		end_page_writeback(pp);
+		/*
+		 * Async writeback is logged and written to the DMU, so page
+		 * can now be unlocked.
+		 */
+		zfs_page_writeback_done(pp, 0);
 	}
 
 	dmu_tx_commit(tx);
@@ -3944,7 +3971,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 	zfs_rangelock_exit(lr);
 
 	if (need_commit) {
-		err = zil_commit(zfsvfs->z_log, zp->z_id);
+		err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW);
 		if (err != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (err);
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 562353268..d07317b0d 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -107,6 +107,10 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
 	return (error);
 }
 
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data);
+
 static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
@@ -116,9 +120,38 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	int error;
 	fstrans_cookie_t cookie;
 
-	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (error)
-		return (error);
+	/*
+	 * Force dirty pages in the range out to the DMU and the log, ready
+	 * for zil_commit() to write down.
+	 *
+	 * We call write_cache_pages() directly to ensure that zpl_putpage() is
+	 * called with the flags we need. We need WB_SYNC_NONE to avoid a call
+	 * to zil_commit() (since we're doing this as a kind of pre-sync); but
+	 * we do need for_sync so that the pages remain in writeback until
+	 * they're on disk, and so that we get an error if the DMU write fails.
+	 */
+	if (filemap_range_has_page(inode->i_mapping, start, end)) {
+		int for_sync = 1;
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_NONE,
+			.nr_to_write = LONG_MAX,
+			.range_start = start,
+			.range_end = end,
+		};
+		error =
+		    zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
+		if (error != 0) {
+			/*
+			 * Unclear what state things are in. zfs_putpage() will
+			 * ensure the pages remain dirty if they haven't been
+			 * written down to the DMU, but because there may be
+			 * nothing logged, we can't assume that zfs_sync() ->
+			 * zil_commit() will give us a useful error. It's
+			 * safest if we just error out here.
+			 */
+			return (error);
+		}
+	}
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
@@ -495,15 +528,25 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	if (sync_mode != wbc->sync_mode) {
 		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 			return (result);
-		if (zfsvfs->z_log != NULL)
-			result = -zil_commit(zfsvfs->z_log, zp->z_id);
+
+		if (zfsvfs->z_log != NULL) {
+			/*
+			 * We don't want to block here if the pool suspends,
+			 * because this is not a syncing op by itself, but
+			 * might be part of one that the caller will
+			 * coordinate.
+			 */
+			result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
+			    ZIL_COMMIT_NOW);
+		}
+
 		zpl_exit(zfsvfs, FTAG);
 
 		/*
-		 * If zil_commit() failed, it's unclear what state things
-		 * are currently in. putpage() has written back out what
-		 * it can to the DMU, but it may not be on disk. We have
-		 * little choice but to escape.
+		 * If zil_commit_flags() failed, it's unclear what state things
+		 * are currently in. putpage() has written back out what it can
+		 * to the DMU, but it may not be on disk. We have little choice
+		 * but to escape.
 		 */
 		if (result != 0)
 			return (result);