mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 18:40:43 +03:00
Speed up WB_SYNC_NONE when a WB_SYNC_ALL occurs simultaneously
Page writebacks with WB_SYNC_NONE can take several seconds to complete since they wait for the transaction group to close before being committed. This is usually not a problem since the caller does not need to wait. However, if we're simultaneously doing a writeback with WB_SYNC_ALL (e.g., via msync), the latter can block for several seconds (up to zfs_txg_timeout) due to the active WB_SYNC_NONE writeback, since it needs to wait for the transaction to complete and the PG_writeback bit to be cleared. This commit deals with two cases: (1) No page writeback is active. A WB_SYNC_ALL page writeback starts and even completes, but when it's about to check whether the PG_writeback bit has been cleared, another writeback with WB_SYNC_NONE starts. The sync page writeback ends up waiting for the non-sync page writeback to complete. (2) A page writeback with WB_SYNC_NONE is already active when a WB_SYNC_ALL writeback starts. The WB_SYNC_ALL writeback ends up waiting for the WB_SYNC_NONE writeback. The fix works by carefully keeping track of active sync/non-sync writebacks and committing when beneficial. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Shaan Nobee <sniper111@gmail.com> Closes #12662 Closes #12790
This commit is contained in:
@@ -496,6 +496,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
|
||||
zp->z_pflags = 0;
|
||||
zp->z_mode = 0;
|
||||
zp->z_sync_cnt = 0;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
ip->i_generation = 0;
|
||||
ip->i_ino = id;
|
||||
ip->i_mode = (S_IFDIR | S_IRWXUGO);
|
||||
|
||||
@@ -3396,7 +3396,7 @@ top:
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_putpage_commit_cb(void *arg)
|
||||
zfs_putpage_sync_commit_cb(void *arg)
|
||||
{
|
||||
struct page *pp = arg;
|
||||
|
||||
@@ -3404,13 +3404,26 @@ zfs_putpage_commit_cb(void *arg)
|
||||
end_page_writeback(pp);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_putpage_async_commit_cb(void *arg)
|
||||
{
|
||||
struct page *pp = arg;
|
||||
znode_t *zp = ITOZ(pp->mapping->host);
|
||||
|
||||
ClearPageError(pp);
|
||||
end_page_writeback(pp);
|
||||
atomic_dec_32(&zp->z_async_writes_cnt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Push a page out to disk, once the page is on stable storage the
|
||||
* registered commit callback will be run as notification of completion.
|
||||
*
|
||||
* IN: ip - page mapped for inode.
|
||||
* pp - page to push (page is locked)
|
||||
* wbc - writeback control data
|
||||
* IN: ip - page mapped for inode.
|
||||
* pp - page to push (page is locked)
|
||||
* wbc - writeback control data
|
||||
* for_sync - does the caller intend to wait synchronously for the
|
||||
* page writeback to complete?
|
||||
*
|
||||
* RETURN: 0 if success
|
||||
* error code if failure
|
||||
@@ -3419,7 +3432,8 @@ zfs_putpage_commit_cb(void *arg)
|
||||
* ip - ctime|mtime updated
|
||||
*/
|
||||
int
|
||||
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
|
||||
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
||||
boolean_t for_sync)
|
||||
{
|
||||
znode_t *zp = ITOZ(ip);
|
||||
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
||||
@@ -3517,6 +3531,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
|
||||
zfs_rangelock_exit(lr);
|
||||
|
||||
if (wbc->sync_mode != WB_SYNC_NONE) {
|
||||
/*
|
||||
* Speed up any non-sync page writebacks since
|
||||
* they may take several seconds to complete.
|
||||
* Refer to the comment in zpl_fsync() (when
|
||||
* HAVE_FSYNC_RANGE is defined) for details.
|
||||
*/
|
||||
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
}
|
||||
|
||||
if (PageWriteback(pp))
|
||||
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
|
||||
folio_wait_bit(page_folio(pp), PG_writeback);
|
||||
@@ -3542,6 +3566,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
|
||||
* was in fact not skipped and should not be counted as if it were.
|
||||
*/
|
||||
wbc->pages_skipped--;
|
||||
if (!for_sync)
|
||||
atomic_inc_32(&zp->z_async_writes_cnt);
|
||||
set_page_writeback(pp);
|
||||
unlock_page(pp);
|
||||
|
||||
@@ -3563,6 +3589,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
|
||||
#endif
|
||||
ClearPageError(pp);
|
||||
end_page_writeback(pp);
|
||||
if (!for_sync)
|
||||
atomic_dec_32(&zp->z_async_writes_cnt);
|
||||
zfs_rangelock_exit(lr);
|
||||
ZFS_EXIT(zfsvfs);
|
||||
return (err);
|
||||
@@ -3587,7 +3615,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
|
||||
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
|
||||
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
|
||||
zfs_putpage_commit_cb, pp);
|
||||
for_sync ? zfs_putpage_sync_commit_cb :
|
||||
zfs_putpage_async_commit_cb, pp);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
zfs_rangelock_exit(lr);
|
||||
@@ -3599,6 +3629,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
|
||||
* performance reasons.
|
||||
*/
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
|
||||
/*
|
||||
* If the caller does not intend to wait synchronously
|
||||
* for this page writeback to complete and there are active
|
||||
* synchronous calls on this file, do a commit so that
|
||||
* the latter don't accidentally end up waiting for
|
||||
* our writeback to complete. Refer to the comment in
|
||||
* zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
|
||||
*/
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
}
|
||||
|
||||
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
|
||||
|
||||
@@ -134,6 +134,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
|
||||
zp->z_acl_cached = NULL;
|
||||
zp->z_xattr_cached = NULL;
|
||||
zp->z_xattr_parent = 0;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -154,6 +157,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
|
||||
ASSERT3P(zp->z_dirlocks, ==, NULL);
|
||||
ASSERT3P(zp->z_acl_cached, ==, NULL);
|
||||
ASSERT3P(zp->z_xattr_cached, ==, NULL);
|
||||
|
||||
ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
|
||||
ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
|
||||
}
|
||||
|
||||
static int
|
||||
@@ -554,6 +560,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
|
||||
zp->z_blksz = blksz;
|
||||
zp->z_seq = 0x7A4653;
|
||||
zp->z_sync_cnt = 0;
|
||||
zp->z_sync_writes_cnt = 0;
|
||||
zp->z_async_writes_cnt = 0;
|
||||
|
||||
zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
|
||||
|
||||
|
||||
@@ -165,17 +165,56 @@ static int
|
||||
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
|
||||
{
|
||||
struct inode *inode = filp->f_mapping->host;
|
||||
znode_t *zp = ITOZ(inode);
|
||||
zfsvfs_t *zfsvfs = ITOZSB(inode);
|
||||
cred_t *cr = CRED();
|
||||
int error;
|
||||
fstrans_cookie_t cookie;
|
||||
|
||||
/*
|
||||
* The variables z_sync_writes_cnt and z_async_writes_cnt work in
|
||||
* tandem so that sync writes can detect if there are any non-sync
|
||||
* writes going on and vice-versa. The "vice-versa" part to this logic
|
||||
* is located in zfs_putpage() where non-sync writes check if there are
|
||||
* any ongoing sync writes. If any sync and non-sync writes overlap,
|
||||
* we do a commit to complete the non-sync writes since the latter can
|
||||
* potentially take several seconds to complete and thus block sync
|
||||
* writes in the upcoming call to filemap_write_and_wait_range().
|
||||
*/
|
||||
atomic_inc_32(&zp->z_sync_writes_cnt);
|
||||
/*
|
||||
* If the following check does not detect an overlapping non-sync write
|
||||
* (say because it's just about to start), then it is guaranteed that
|
||||
* the non-sync write will detect this sync write. This is because we
|
||||
* always increment z_sync_writes_cnt / z_async_writes_cnt before doing
|
||||
* the check on z_async_writes_cnt / z_sync_writes_cnt here and in
|
||||
* zfs_putpage() respectively.
|
||||
*/
|
||||
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
|
||||
ZPL_ENTER(zfsvfs);
|
||||
zil_commit(zfsvfs->z_log, zp->z_id);
|
||||
ZPL_EXIT(zfsvfs);
|
||||
}
|
||||
|
||||
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
|
||||
|
||||
/*
|
||||
* The sync write is not complete yet but we decrement
|
||||
* z_sync_writes_cnt since zfs_fsync() increments and decrements
|
||||
* it internally. If a non-sync write starts just after the decrement
|
||||
* operation but before we call zfs_fsync(), it may not detect this
|
||||
* overlapping sync write but it does not matter since we have already
|
||||
* gone past filemap_write_and_wait_range() and we won't block due to
|
||||
* the non-sync write.
|
||||
*/
|
||||
atomic_dec_32(&zp->z_sync_writes_cnt);
|
||||
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
crhold(cr);
|
||||
cookie = spl_fstrans_mark();
|
||||
error = -zfs_fsync(ITOZ(inode), datasync, cr);
|
||||
error = -zfs_fsync(zp, datasync, cr);
|
||||
spl_fstrans_unmark(cookie);
|
||||
crfree(cr);
|
||||
ASSERT3S(error, <=, 0);
|
||||
@@ -680,14 +719,14 @@ zpl_readahead(struct readahead_control *ractl)
|
||||
static int
|
||||
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
|
||||
{
|
||||
struct address_space *mapping = data;
|
||||
boolean_t *for_sync = data;
|
||||
fstrans_cookie_t cookie;
|
||||
|
||||
ASSERT(PageLocked(pp));
|
||||
ASSERT(!PageWriteback(pp));
|
||||
|
||||
cookie = spl_fstrans_mark();
|
||||
(void) zfs_putpage(mapping->host, pp, wbc);
|
||||
(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
||||
spl_fstrans_unmark(cookie);
|
||||
|
||||
return (0);
|
||||
@@ -714,8 +753,9 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
||||
* we run it once in non-SYNC mode so that the ZIL gets all the data,
|
||||
* and then we commit it all in one go.
|
||||
*/
|
||||
boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
|
||||
wbc->sync_mode = WB_SYNC_NONE;
|
||||
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
|
||||
result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync);
|
||||
if (sync_mode != wbc->sync_mode) {
|
||||
ZPL_ENTER(zfsvfs);
|
||||
ZPL_VERIFY_ZP(zp);
|
||||
@@ -731,7 +771,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
||||
* details). That being said, this is a no-op in most cases.
|
||||
*/
|
||||
wbc->sync_mode = sync_mode;
|
||||
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
|
||||
result = write_cache_pages(mapping, wbc, zpl_putpage,
|
||||
&for_sync);
|
||||
}
|
||||
return (result);
|
||||
}
|
||||
@@ -748,7 +789,9 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
|
||||
if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
||||
wbc->sync_mode = WB_SYNC_ALL;
|
||||
|
||||
return (zpl_putpage(pp, wbc, pp->mapping));
|
||||
boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);
|
||||
|
||||
return (zpl_putpage(pp, wbc, &for_sync));
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user