Mirror of https://git.proxmox.com/git/mirror_zfs.git (synced 2024-11-17 01:51:00 +03:00)
Speed up WB_SYNC_NONE when a WB_SYNC_ALL occurs simultaneously
Page writebacks with WB_SYNC_NONE can take several seconds to complete since they wait for the transaction group to close before being committed. This is usually not a problem since the caller does not need to wait. However, if we're simultaneously doing a writeback with WB_SYNC_ALL (e.g. via msync), the latter can block for several seconds (up to zfs_txg_timeout) due to the active WB_SYNC_NONE writeback, since it needs to wait for the transaction to complete and the PG_writeback bit to be cleared.

This commit deals with two cases:

- No page writeback is active. A WB_SYNC_ALL page writeback starts and even completes. But when it's about to check whether the PG_writeback bit has been cleared, another writeback with WB_SYNC_NONE starts. The sync page writeback ends up waiting for the non-sync page writeback to complete.

- A page writeback with WB_SYNC_NONE is already active when a WB_SYNC_ALL writeback starts. The WB_SYNC_ALL writeback ends up waiting for the WB_SYNC_NONE writeback.

The fix works by carefully keeping track of active sync/non-sync writebacks and committing when beneficial.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Shaan Nobee <sniper111@gmail.com>
Closes #12662
Closes #12790
parent a64d757aa4
commit 411f4a018d
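The core of the fix is a pair of per-znode counters that let sync and non-sync writebacks detect each other: each path increments its own counter before checking the other side's, so when the two overlap, at least one of them notices and forces an early commit instead of waiting out the transaction group. The following is a minimal user-space sketch of that handshake using C11 atomics; the names (sync_writeback, async_writeback, force_commit) are stand-ins for illustration, not the actual ZFS interfaces, which appear in the diff below as z_sync_writes_cnt / z_async_writes_cnt and zil_commit().

/*
 * Minimal sketch of the increment-then-check handshake: each side
 * announces itself before looking at the other side, so an overlapping
 * sync and async writeback cannot both miss each other.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint sync_writes_cnt;     /* callers that will wait (WB_SYNC_ALL) */
static atomic_uint async_writes_cnt;    /* callers that won't wait (WB_SYNC_NONE) */

static void
force_commit(const char *who)
{
    /* Stand-in for the "commit now" action (zil_commit() in the patch). */
    printf("%s: overlap detected, committing early\n", who);
}

/* Path taken by a synchronous writeback (e.g. msync()/fsync()). */
static void
sync_writeback(void)
{
    atomic_fetch_add(&sync_writes_cnt, 1);      /* announce first... */
    if (atomic_load(&async_writes_cnt) > 0)     /* ...then check */
        force_commit("sync");
    /* ...wait for the pages to reach stable storage... */
    atomic_fetch_sub(&sync_writes_cnt, 1);
}

/* Path taken by a background (non-sync) writeback. */
static void
async_writeback(void)
{
    atomic_fetch_add(&async_writes_cnt, 1);     /* announce first... */
    if (atomic_load(&sync_writes_cnt) > 0)      /* ...then check */
        force_commit("async");
    /* ...write the pages, let the txg commit them lazily... */
    atomic_fetch_sub(&async_writes_cnt, 1);
}

int
main(void)
{
    /*
     * Simulate an async writeback that has announced itself but not yet
     * finished when a sync writeback arrives: the sync side sees it and
     * commits early instead of blocking behind it.
     */
    atomic_fetch_add(&async_writes_cnt, 1);
    sync_writeback();
    atomic_fetch_sub(&async_writes_cnt, 1);
    return (0);
}

Because each side increments before it checks, two overlapping writebacks cannot both miss each other: if the sync side's check sees no async writeback, an overlapping async writeback is guaranteed to see the raised sync counter, which is exactly the property the comments in zpl_fsync() below spell out.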
@@ -58,6 +58,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
     __field(uint64_t, z_size)
     __field(uint64_t, z_pflags)
     __field(uint32_t, z_sync_cnt)
+    __field(uint32_t, z_sync_writes_cnt)
+    __field(uint32_t, z_async_writes_cnt)
     __field(mode_t, z_mode)
     __field(boolean_t, z_is_sa)
     __field(boolean_t, z_is_mapped)
@@ -91,6 +93,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
     __entry->z_size = zn->z_size;
     __entry->z_pflags = zn->z_pflags;
     __entry->z_sync_cnt = zn->z_sync_cnt;
+    __entry->z_sync_writes_cnt = zn->z_sync_writes_cnt;
+    __entry->z_async_writes_cnt = zn->z_async_writes_cnt;
     __entry->z_mode = zn->z_mode;
     __entry->z_is_sa = zn->z_is_sa;
     __entry->z_is_mapped = zn->z_is_mapped;
@@ -116,16 +120,18 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
     TP_printk("zn { id %llu unlinked %u atime_dirty %u "
         "zn_prefetch %u blksz %u seq %u "
         "mapcnt %llu size %llu pflags %llu "
-        "sync_cnt %u mode 0x%x is_sa %d "
-        "is_mapped %d is_ctldir %d is_stale %d inode { "
+        "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u "
+        "mode 0x%x is_sa %d is_mapped %d "
+        "is_ctldir %d is_stale %d inode { "
         "uid %u gid %u ino %lu nlink %u size %lli "
         "blkbits %u bytes %u mode 0x%x generation %x } } "
         "ace { type %u flags %u access_mask %u } mask_matched %u",
         __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty,
         __entry->z_zn_prefetch, __entry->z_blksz,
         __entry->z_seq, __entry->z_mapcnt, __entry->z_size,
-        __entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode,
-        __entry->z_is_sa, __entry->z_is_mapped,
+        __entry->z_pflags, __entry->z_sync_cnt,
+        __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt,
+        __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped,
         __entry->z_is_ctldir, __entry->z_is_stale, __entry->i_uid,
         __entry->i_gid, __entry->i_ino, __entry->i_nlink,
         __entry->i_size, __entry->i_blkbits,
@@ -70,7 +70,7 @@ extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
 extern int zfs_fid(struct inode *ip, fid_t *fidp);
 extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
 extern int zfs_putpage(struct inode *ip, struct page *pp,
-    struct writeback_control *wbc);
+    struct writeback_control *wbc, boolean_t for_sync);
 extern int zfs_dirty_inode(struct inode *ip, int flags);
 extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp,
     size_t len, unsigned long vm_flags);
@@ -199,6 +199,8 @@ typedef struct znode {
     uint64_t z_size;              /* file size (cached) */
     uint64_t z_pflags;            /* pflags (cached) */
     uint32_t z_sync_cnt;          /* synchronous open count */
+    uint32_t z_sync_writes_cnt;   /* synchronous write count */
+    uint32_t z_async_writes_cnt;  /* asynchronous write count */
     mode_t z_mode;                /* mode (cached) */
     kmutex_t z_acl_lock;          /* acl data lock */
     zfs_acl_t *z_acl_cached;      /* cached acl */
@@ -153,6 +153,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
     zp->z_xattr_cached = NULL;
     zp->z_xattr_parent = 0;
     zp->z_vnode = NULL;
+    zp->z_sync_writes_cnt = 0;
+    zp->z_async_writes_cnt = 0;
+
     return (0);
 }
 
@@ -172,6 +175,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 
     ASSERT3P(zp->z_acl_cached, ==, NULL);
     ASSERT3P(zp->z_xattr_cached, ==, NULL);
+
+    ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
+    ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 
@@ -453,6 +459,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     zp->z_blksz = blksz;
     zp->z_seq = 0x7A4653;
     zp->z_sync_cnt = 0;
+    zp->z_sync_writes_cnt = 0;
+    zp->z_async_writes_cnt = 0;
 #if __FreeBSD_version >= 1300139
     atomic_store_ptr(&zp->z_cached_symlink, NULL);
 #endif
@@ -496,6 +496,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
     zp->z_pflags = 0;
     zp->z_mode = 0;
     zp->z_sync_cnt = 0;
+    zp->z_sync_writes_cnt = 0;
+    zp->z_async_writes_cnt = 0;
     ip->i_generation = 0;
     ip->i_ino = id;
     ip->i_mode = (S_IFDIR | S_IRWXUGO);
@@ -3396,7 +3396,7 @@ top:
 }
 
 static void
-zfs_putpage_commit_cb(void *arg)
+zfs_putpage_sync_commit_cb(void *arg)
 {
     struct page *pp = arg;
 
@@ -3404,13 +3404,26 @@ zfs_putpage_commit_cb(void *arg)
     end_page_writeback(pp);
 }
 
+static void
+zfs_putpage_async_commit_cb(void *arg)
+{
+    struct page *pp = arg;
+    znode_t *zp = ITOZ(pp->mapping->host);
+
+    ClearPageError(pp);
+    end_page_writeback(pp);
+    atomic_dec_32(&zp->z_async_writes_cnt);
+}
+
 /*
  * Push a page out to disk, once the page is on stable storage the
  * registered commit callback will be run as notification of completion.
  *
- * IN: ip  - page mapped for inode.
- *     pp  - page to push (page is locked)
- *     wbc - writeback control data
+ * IN: ip       - page mapped for inode.
+ *     pp       - page to push (page is locked)
+ *     wbc      - writeback control data
+ *     for_sync - does the caller intend to wait synchronously for the
+ *                page writeback to complete?
 *
 * RETURN: 0 if success
 *         error code if failure
@@ -3419,7 +3432,8 @@ zfs_putpage_commit_cb(void *arg)
 *     ip - ctime|mtime updated
 */
 int
-zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
+    boolean_t for_sync)
 {
     znode_t *zp = ITOZ(ip);
     zfsvfs_t *zfsvfs = ITOZSB(ip);
@@ -3517,6 +3531,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         zfs_rangelock_exit(lr);
 
         if (wbc->sync_mode != WB_SYNC_NONE) {
+            /*
+             * Speed up any non-sync page writebacks since
+             * they may take several seconds to complete.
+             * Refer to the comment in zpl_fsync() (when
+             * HAVE_FSYNC_RANGE is defined) for details.
+             */
+            if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
+                zil_commit(zfsvfs->z_log, zp->z_id);
+            }
+
             if (PageWriteback(pp))
 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
                 folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3542,6 +3566,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
     * was in fact not skipped and should not be counted as if it were.
     */
     wbc->pages_skipped--;
+    if (!for_sync)
+        atomic_inc_32(&zp->z_async_writes_cnt);
     set_page_writeback(pp);
     unlock_page(pp);
 
@@ -3563,6 +3589,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 #endif
         ClearPageError(pp);
         end_page_writeback(pp);
+        if (!for_sync)
+            atomic_dec_32(&zp->z_async_writes_cnt);
         zfs_rangelock_exit(lr);
         ZFS_EXIT(zfsvfs);
         return (err);
@@ -3587,7 +3615,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
     err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
     zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
-        zfs_putpage_commit_cb, pp);
+        for_sync ? zfs_putpage_sync_commit_cb :
+        zfs_putpage_async_commit_cb, pp);
+
     dmu_tx_commit(tx);
 
     zfs_rangelock_exit(lr);
@@ -3599,6 +3629,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
         * performance reasons.
         */
         zil_commit(zfsvfs->z_log, zp->z_id);
+    } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
+        /*
+         * If the caller does not intend to wait synchronously
+         * for this page writeback to complete and there are active
+         * synchronous calls on this file, do a commit so that
+         * the latter don't accidentally end up waiting for
+         * our writeback to complete. Refer to the comment in
+         * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
+         */
+        zil_commit(zfsvfs->z_log, zp->z_id);
     }
 
     dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
@@ -134,6 +134,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
     zp->z_acl_cached = NULL;
     zp->z_xattr_cached = NULL;
     zp->z_xattr_parent = 0;
+    zp->z_sync_writes_cnt = 0;
+    zp->z_async_writes_cnt = 0;
+
     return (0);
 }
 
@@ -154,6 +157,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
     ASSERT3P(zp->z_dirlocks, ==, NULL);
     ASSERT3P(zp->z_acl_cached, ==, NULL);
     ASSERT3P(zp->z_xattr_cached, ==, NULL);
+
+    ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
+    ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 static int
@@ -554,6 +560,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     zp->z_blksz = blksz;
     zp->z_seq = 0x7A4653;
     zp->z_sync_cnt = 0;
+    zp->z_sync_writes_cnt = 0;
+    zp->z_async_writes_cnt = 0;
 
     zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
@@ -165,17 +165,56 @@ static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
     struct inode *inode = filp->f_mapping->host;
+    znode_t *zp = ITOZ(inode);
+    zfsvfs_t *zfsvfs = ITOZSB(inode);
     cred_t *cr = CRED();
     int error;
     fstrans_cookie_t cookie;
 
+    /*
+     * The variables z_sync_writes_cnt and z_async_writes_cnt work in
+     * tandem so that sync writes can detect if there are any non-sync
+     * writes going on and vice-versa. The "vice-versa" part to this logic
+     * is located in zfs_putpage() where non-sync writes check if there are
+     * any ongoing sync writes. If any sync and non-sync writes overlap,
+     * we do a commit to complete the non-sync writes since the latter can
+     * potentially take several seconds to complete and thus block sync
+     * writes in the upcoming call to filemap_write_and_wait_range().
+     */
+    atomic_inc_32(&zp->z_sync_writes_cnt);
+    /*
+     * If the following check does not detect an overlapping non-sync write
+     * (say because it's just about to start), then it is guaranteed that
+     * the non-sync write will detect this sync write. This is because we
+     * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
+     * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
+     * zfs_putpage() respectively.
+     */
+    if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
+        ZPL_ENTER(zfsvfs);
+        zil_commit(zfsvfs->z_log, zp->z_id);
+        ZPL_EXIT(zfsvfs);
+    }
+
     error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+
+    /*
+     * The sync write is not complete yet but we decrement
+     * z_sync_writes_cnt since zfs_fsync() increments and decrements
+     * it internally. If a non-sync write starts just after the decrement
+     * operation but before we call zfs_fsync(), it may not detect this
+     * overlapping sync write but it does not matter since we have already
+     * gone past filemap_write_and_wait_range() and we won't block due to
+     * the non-sync write.
+     */
+    atomic_dec_32(&zp->z_sync_writes_cnt);
+
     if (error)
         return (error);
 
     crhold(cr);
     cookie = spl_fstrans_mark();
-    error = -zfs_fsync(ITOZ(inode), datasync, cr);
+    error = -zfs_fsync(zp, datasync, cr);
     spl_fstrans_unmark(cookie);
     crfree(cr);
     ASSERT3S(error, <=, 0);
@@ -680,14 +719,14 @@ zpl_readahead(struct readahead_control *ractl)
 static int
 zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 {
-    struct address_space *mapping = data;
+    boolean_t *for_sync = data;
     fstrans_cookie_t cookie;
 
     ASSERT(PageLocked(pp));
     ASSERT(!PageWriteback(pp));
 
     cookie = spl_fstrans_mark();
-    (void) zfs_putpage(mapping->host, pp, wbc);
+    (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
     spl_fstrans_unmark(cookie);
 
     return (0);
@@ -714,8 +753,9 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
     * we run it once in non-SYNC mode so that the ZIL gets all the data,
     * and then we commit it all in one go.
     */
+    boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
     wbc->sync_mode = WB_SYNC_NONE;
-    result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+    result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync);
     if (sync_mode != wbc->sync_mode) {
         ZPL_ENTER(zfsvfs);
         ZPL_VERIFY_ZP(zp);
@@ -731,7 +771,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
         * details). That being said, this is a no-op in most cases.
         */
         wbc->sync_mode = sync_mode;
-        result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+        result = write_cache_pages(mapping, wbc, zpl_putpage,
+            &for_sync);
     }
     return (result);
 }
@@ -748,7 +789,9 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
     if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
         wbc->sync_mode = WB_SYNC_ALL;
 
-    return (zpl_putpage(pp, wbc, pp->mapping));
+    boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);
+
+    return (zpl_putpage(pp, wbc, &for_sync));
 }
 
 /*
@@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
     if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
         ZFS_ENTER(zfsvfs);
         ZFS_VERIFY_ZP(zp);
+        atomic_inc_32(&zp->z_sync_writes_cnt);
         zil_commit(zfsvfs->z_log, zp->z_id);
+        atomic_dec_32(&zp->z_sync_writes_cnt);
         ZFS_EXIT(zfsvfs);
     }
     tsd_set(zfs_fsyncer_key, NULL);
@@ -681,7 +681,7 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
 tags = ['functional', 'migration']
 
 [tests/functional/mmap]
-tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos']
+tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos']
 tags = ['functional', 'mmap']
 
 [tests/functional/mount]
@@ -167,6 +167,7 @@ if sys.platform.startswith('freebsd'):
         'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
         'link_count/link_count_001': ['SKIP', na_reason],
         'casenorm/mixed_create_failure': ['FAIL', 13215],
+        'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
     })
 elif sys.platform.startswith('linux'):
     known.update({
tests/zfs-tests/cmd/.gitignore (vendored, 1 change)
@@ -18,6 +18,7 @@
 /mmap_exec
 /mmap_libaio
 /mmap_seek
+/mmap_sync
 /mmapwrite
 /nvlist_to_lua
 /randfree_file
@@ -80,9 +80,10 @@ mkfiles_SOURCES = mkfiles.c
 mktree_SOURCES = mktree.c
 
 
-pkgexec_PROGRAMS += mmap_exec mmap_seek mmapwrite readmmap
+pkgexec_PROGRAMS += mmap_exec mmap_seek mmap_sync mmapwrite readmmap
 mmap_exec_SOURCES = mmap_exec.c
 mmap_seek_SOURCES = mmap_seek.c
+mmap_sync_SOURCES = mmap_sync.c
 mmapwrite_SOURCES = mmapwrite.c
 mmapwrite_LDADD = -lpthread
 readmmap_SOURCES = readmmap.c
tests/zfs-tests/cmd/mmap_sync.c (new file, 152 lines)
@@ -0,0 +1,152 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+
+static void
+cleanup(char *file)
+{
+    remove(file);
+}
+
+int
+main(int argc, char *argv[])
+{
+    char *testdir = getenv("TESTDIR");
+    if (!testdir) {
+        fprintf(stderr, "environment variable TESTDIR not set\n");
+        return (1);
+    }
+
+    struct stat st;
+    umask(0);
+    if (stat(testdir, &st) != 0 &&
+        mkdir(testdir, 0777) != 0) {
+        perror("mkdir");
+        return (1);
+    }
+
+    if (argc > 3) {
+        fprintf(stderr, "usage: %s "
+            "[run time in mins] "
+            "[max msync time in ms]\n", argv[0]);
+        return (1);
+    }
+
+    int run_time_mins = 5;
+    if (argc >= 2) {
+        run_time_mins = atoi(argv[1]);
+    }
+
+    int max_msync_time_ms = 1000;
+    if (argc >= 3) {
+        max_msync_time_ms = atoi(argv[2]);
+    }
+
+    char filepath[512];
+    filepath[0] = '\0';
+    char *file = &filepath[0];
+
+    strcat(file, testdir);
+    strcat(file, "/msync_file");
+
+    const int LEN = 8;
+    cleanup(file);
+
+    int fd = open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR |
+        S_IRGRP | S_IROTH);
+
+    if (fd == -1) {
+        (void) fprintf(stderr, "%s: %s: ", argv[0], file);
+        perror("open");
+        return (1);
+    }
+
+    if (ftruncate(fd, LEN) != 0) {
+        perror("ftruncate");
+        cleanup(file);
+        return (1);
+    }
+
+    void *ptr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+    if (ptr == MAP_FAILED) {
+        perror("mmap");
+        cleanup(file);
+        return (1);
+    }
+
+    struct timeval tstart;
+    gettimeofday(&tstart, NULL);
+
+    long long x = 0LL;
+
+    for (;;) {
+        *((long long *)ptr) = x;
+        x++;
+
+        struct timeval t1, t2;
+        gettimeofday(&t1, NULL);
+        if (msync(ptr, LEN, MS_SYNC|MS_INVALIDATE) != 0) {
+            perror("msync");
+            cleanup(file);
+            return (1);
+        }
+
+        gettimeofday(&t2, NULL);
+
+        double elapsed = (t2.tv_sec - t1.tv_sec) * 1000.0;
+        elapsed += ((t2.tv_usec - t1.tv_usec) / 1000.0);
+        if (elapsed > max_msync_time_ms) {
+            fprintf(stderr, "slow msync: %f ms\n", elapsed);
+            munmap(ptr, LEN);
+            cleanup(file);
+            return (1);
+        }
+
+        double elapsed_start = (t2.tv_sec - tstart.tv_sec) * 1000.0;
+        elapsed_start += ((t2.tv_usec - tstart.tv_usec) / 1000.0);
+        if (elapsed_start > run_time_mins * 60 * 1000) {
+            break;
+        }
+    }
+
+    if (munmap(ptr, LEN) != 0) {
+        perror("munmap");
+        cleanup(file);
+        return (1);
+    }
+
+    if (close(fd) != 0) {
+        perror("close");
+    }
+
+    cleanup(file);
+    return (0);
+}
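A note on usage (an illustrative invocation, not part of the commit): mmap_sync reads the target directory from the TESTDIR environment variable and accepts two optional arguments, the run time in minutes (default 5) and the maximum tolerated msync() latency in milliseconds (default 1000). Something like TESTDIR=/var/tmp/mmap_sync_test ./mmap_sync 1 1000 would repeatedly write and msync() an 8-byte mapping for one minute and exit non-zero if any msync() call takes longer than a second.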
@@ -193,6 +193,7 @@ export ZFSTEST_FILES='badsend
     mmap_exec
     mmap_libaio
     mmap_seek
+    mmap_sync
     mmapwrite
     nvlist_to_lua
     randfree_file
@@ -5,7 +5,8 @@ dist_pkgdata_SCRIPTS = \
     mmap_read_001_pos.ksh \
     mmap_write_001_pos.ksh \
     mmap_libaio_001_pos.ksh \
-    mmap_seek_001_pos.ksh
+    mmap_seek_001_pos.ksh \
+    mmap_sync_001_pos.ksh
 
 dist_pkgdata_DATA = \
     mmap.cfg
tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh (new executable file, 63 lines)
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# msync()s of mmap()'ed file should complete quickly during
+# background dirty page writebacks by the kernel.
+#
+
+function cleanup
+{
+    log_must eval "echo $saved_vm_dirty_expire_centisecs > /proc/sys/vm/dirty_expire_centisecs"
+    log_must eval "echo $saved_vm_dirty_background_ratio > /proc/sys/vm/dirty_background_ratio"
+    log_must eval "echo $saved_vm_dirty_writeback_centisecs > /proc/sys/vm/dirty_writeback_centisecs"
+
+    # revert to some sensible defaults if the values we saved
+    # were incorrect due to a previous run being interrupted
+    if [ $(</proc/sys/vm/dirty_expire_centisecs) -eq 1 ]; then
+        log_must eval "echo 3000 > /proc/sys/vm/dirty_expire_centisecs"
+    fi
+
+    if [ $(</proc/sys/vm/dirty_background_ratio) -eq 0 ]; then
+        log_must eval "echo 10 > /proc/sys/vm/dirty_background_ratio"
+    fi
+
+    if [ $(</proc/sys/vm/dirty_writeback_centisecs) -eq 1 ]; then
+        log_must eval "echo 500 > /proc/sys/vm/dirty_writeback_centisecs"
+    fi
+}
+
+if ! is_linux; then
+    log_unsupported "Only supported on Linux, requires /proc/sys/vm/ tunables"
+fi
+
+log_onexit cleanup
+log_assert "Run the tests for mmap_sync"
+
+read -r saved_vm_dirty_expire_centisecs < /proc/sys/vm/dirty_expire_centisecs
+read -r saved_vm_dirty_background_ratio < /proc/sys/vm/dirty_background_ratio
+read -r saved_vm_dirty_writeback_centisecs < /proc/sys/vm/dirty_writeback_centisecs
+
+log_must eval "echo 1 > /proc/sys/vm/dirty_expire_centisecs"
+log_must eval "echo 1 > /proc/sys/vm/dirty_background_bytes"
+log_must eval "echo 1 > /proc/sys/vm/dirty_writeback_centisecs"
+
+log_must mmap_sync
+log_pass "mmap_sync tests passed."