From 9e5a297de66b968889513a9bc08e61a9b65bf893 Mon Sep 17 00:00:00 2001
From: Shaan Nobee <sniper111@gmail.com>
Date: Wed, 4 May 2022 00:23:26 +0400
Subject: [PATCH] Speed up WB_SYNC_NONE when a WB_SYNC_ALL occurs
 simultaneously

Page writebacks with WB_SYNC_NONE can take several seconds to complete
since they wait for the transaction group to close before being
committed. This is usually not a problem since the caller does not
need to wait. However, if we're simultaneously doing a writeback
with WB_SYNC_ALL (e.g. via msync), the latter can block for several
seconds (up to zfs_txg_timeout) due to the active WB_SYNC_NONE
writeback since it needs to wait for the transaction to complete
and the PG_writeback bit to be cleared.
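
For illustration, the stall is user-visible from a sequence as simple as
the following (hypothetical minimal example; the bundled mmap_sync test
exercises a similar path, and "/tank/f" is only a placeholder path):

    /* sketch only: error handling omitted */
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/tank/f", O_RDWR | O_CREAT, 0644);
        ftruncate(fd, 4096);
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
            MAP_SHARED, fd, 0);
        p[0] = 'x';               /* dirty the mapped page */
        /* a WB_SYNC_NONE writeback of this page may start here */
        msync(p, 4096, MS_SYNC);  /* WB_SYNC_ALL; could stall for seconds */
        return (0);
    }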

This commit deals with 2 cases:

- No page writeback is active. A WB_SYNC_ALL page writeback starts
  and even completes. But when it's about to check if the PG_writeback
  bit has been cleared, another writeback with WB_SYNC_NONE starts.
  The sync page writeback ends up waiting for the non-sync page
  writeback to complete.

- A page writeback with WB_SYNC_NONE is already active when a
  WB_SYNC_ALL writeback starts. The WB_SYNC_ALL writeback ends up
  waiting for the WB_SYNC_NONE writeback.

The fix works by tracking in-flight sync and non-sync writebacks per
znode and issuing a zil_commit() whenever the two overlap, so that a
sync writeback never has to wait for a non-sync one to be committed at
transaction group close.
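
Condensed sketch of the new logic (taken from the zpl_fsync() and
zfs_putpage() changes below; not compilable on its own):

    /* sync path (zpl_fsync) */
    atomic_inc_32(&zp->z_sync_writes_cnt);
    if (atomic_load_32(&zp->z_async_writes_cnt) > 0)
        zil_commit(zfsvfs->z_log, zp->z_id);   /* flush async writes */
    error = filemap_write_and_wait_range(inode->i_mapping, start, end);
    atomic_dec_32(&zp->z_sync_writes_cnt);

    /* async path (zfs_putpage with for_sync == B_FALSE) */
    atomic_inc_32(&zp->z_async_writes_cnt);
    zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
        zfs_putpage_async_commit_cb, pp);      /* cb drops the count */
    if (atomic_load_32(&zp->z_sync_writes_cnt) > 0)
        zil_commit(zfsvfs->z_log, zp->z_id);   /* unblock sync waiter */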

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Shaan Nobee <sniper111@gmail.com>
Closes #12662
Closes #12790
---
 configure.ac                                  |  1 +
 include/os/linux/zfs/sys/trace_acl.h          | 15 +++--
 include/os/linux/zfs/sys/zfs_vnops_os.h       |  2 +-
 include/sys/zfs_znode.h                       |  2 +
 module/os/freebsd/zfs/zfs_znode.c             |  8 +++
 module/os/linux/zfs/zfs_ctldir.c              |  2 +
 module/os/linux/zfs/zfs_vnops_os.c            | 52 +++++++++++++--
 module/os/linux/zfs/zfs_znode.c               | 14 ++++-
 module/os/linux/zfs/zpl_file.c                | 55 ++++++++++++++--
 module/zfs/zfs_vnops.c                        |  2 +
 tests/runfiles/common.run                     |  2 +-
 tests/test-runner/bin/zts-report.py.in        |  1 +
 tests/zfs-tests/cmd/Makefile.am               |  1 +
 tests/zfs-tests/cmd/mmap_sync/.gitignore      |  1 +
 tests/zfs-tests/cmd/mmap_sync/Makefile.am     |  6 ++
 .../zfs-tests/cmd/{ => mmap_sync}/mmap_sync.c |  0
 tests/zfs-tests/include/commands.cfg          |  1 +
 .../tests/functional/mmap/Makefile.am         |  3 +-
 .../functional/mmap/mmap_sync_001_pos.ksh     | 63 +++++++++++++++++++
 19 files changed, 208 insertions(+), 23 deletions(-)
 create mode 100644 tests/zfs-tests/cmd/mmap_sync/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mmap_sync/Makefile.am
 rename tests/zfs-tests/cmd/{ => mmap_sync}/mmap_sync.c (100%)
 create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh

diff --git a/configure.ac b/configure.ac
index 2671434af..cb339ccd4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -222,6 +222,7 @@ AC_CONFIG_FILES([
 	tests/zfs-tests/cmd/mmap_exec/Makefile
 	tests/zfs-tests/cmd/mmap_libaio/Makefile
 	tests/zfs-tests/cmd/mmap_seek/Makefile
+	tests/zfs-tests/cmd/mmap_sync/Makefile
 	tests/zfs-tests/cmd/mmapwrite/Makefile
 	tests/zfs-tests/cmd/nvlist_to_lua/Makefile
 	tests/zfs-tests/cmd/randfree_file/Makefile
diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h
index 21bcefa4e..f20abfe4b 100644
--- a/include/os/linux/zfs/sys/trace_acl.h
+++ b/include/os/linux/zfs/sys/trace_acl.h
@@ -58,6 +58,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __field(uint64_t,		z_size)
 	    __field(uint64_t,		z_pflags)
 	    __field(uint32_t,		z_sync_cnt)
+	    __field(uint32_t,		z_sync_writes_cnt)
+	    __field(uint32_t,		z_async_writes_cnt)
 	    __field(mode_t,		z_mode)
 	    __field(boolean_t,		z_is_sa)
 	    __field(boolean_t,		z_is_mapped)
@@ -90,6 +92,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	    __entry->z_size		= zn->z_size;
 	    __entry->z_pflags		= zn->z_pflags;
 	    __entry->z_sync_cnt		= zn->z_sync_cnt;
+	    __entry->z_sync_writes_cnt	= zn->z_sync_writes_cnt;
+	    __entry->z_async_writes_cnt	= zn->z_async_writes_cnt;
 	    __entry->z_mode		= zn->z_mode;
 	    __entry->z_is_sa		= zn->z_is_sa;
 	    __entry->z_is_mapped	= zn->z_is_mapped;
@@ -114,17 +118,18 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
 	TP_printk("zn { id %llu unlinked %u atime_dirty %u "
 	    "zn_prefetch %u blksz %u seq %u "
 	    "mapcnt %llu size %llu pflags %llu "
-	    "sync_cnt %u mode 0x%x is_sa %d "
-	    "is_mapped %d is_ctldir %d inode { "
+	    "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u "
+	    "mode 0x%x is_sa %d is_mapped %d is_ctldir %d inode { "
 	    "uid %u gid %u ino %lu nlink %u size %lli "
 	    "blkbits %u bytes %u mode 0x%x generation %x } } "
 	    "ace { type %u flags %u access_mask %u } mask_matched %u",
 	    __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty,
 	    __entry->z_zn_prefetch, __entry->z_blksz,
 	    __entry->z_seq, __entry->z_mapcnt, __entry->z_size,
-	    __entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode,
-	    __entry->z_is_sa, __entry->z_is_mapped,
-	    __entry->z_is_ctldir, __entry->i_uid,
+	    __entry->z_pflags, __entry->z_sync_cnt,
+	    __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt,
+	    __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped,
+	    __entry->z_is_ctldir,  __entry->i_uid,
 	    __entry->i_gid, __entry->i_ino, __entry->i_nlink,
 	    __entry->i_size, __entry->i_blkbits,
 	    __entry->i_bytes, __entry->i_mode, __entry->i_generation,
diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h
index 129bc43a8..f4b78bee6 100644
--- a/include/os/linux/zfs/sys/zfs_vnops_os.h
+++ b/include/os/linux/zfs/sys/zfs_vnops_os.h
@@ -70,7 +70,7 @@ extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
 extern int zfs_fid(struct inode *ip, fid_t *fidp);
 extern int zfs_getpage(struct inode *ip, struct page *pp);
 extern int zfs_putpage(struct inode *ip, struct page *pp,
-    struct writeback_control *wbc);
+    struct writeback_control *wbc, boolean_t for_sync);
 extern int zfs_dirty_inode(struct inode *ip, int flags);
 extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp,
     size_t len, unsigned long vm_flags);
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index 0df8a0e4b..1a4be9bfd 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -198,6 +198,8 @@ typedef struct znode {
 	uint64_t	z_size;		/* file size (cached) */
 	uint64_t	z_pflags;	/* pflags (cached) */
 	uint32_t	z_sync_cnt;	/* synchronous open count */
+	uint32_t	z_sync_writes_cnt; /* synchronous write count */
+	uint32_t	z_async_writes_cnt; /* asynchronous write count */
 	mode_t		z_mode;		/* mode (cached) */
 	kmutex_t	z_acl_lock;	/* acl data lock */
 	zfs_acl_t	*z_acl_cached;	/* cached acl */
diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c
index 1debc3ec3..92e3bdd2e 100644
--- a/module/os/freebsd/zfs/zfs_znode.c
+++ b/module/os/freebsd/zfs/zfs_znode.c
@@ -153,6 +153,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
 	zp->z_vnode = NULL;
+	zp->z_sync_writes_cnt = 0;
+	zp->z_async_writes_cnt = 0;
+
 	return (0);
 }
 
@@ -172,6 +175,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+
+	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
+	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 
@@ -457,6 +463,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
+	zp->z_sync_writes_cnt = 0;
+	zp->z_async_writes_cnt = 0;
 #if __FreeBSD_version >= 1300139
 	atomic_store_ptr(&zp->z_cached_symlink, NULL);
 #endif
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c
index c45644a69..a60eeaaba 100644
--- a/module/os/linux/zfs/zfs_ctldir.c
+++ b/module/os/linux/zfs/zfs_ctldir.c
@@ -478,6 +478,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
 	zp->z_pflags = 0;
 	zp->z_mode = 0;
 	zp->z_sync_cnt = 0;
+	zp->z_sync_writes_cnt = 0;
+	zp->z_async_writes_cnt = 0;
 	ip->i_generation = 0;
 	ip->i_ino = id;
 	ip->i_mode = (S_IFDIR | S_IRWXUGO);
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index c17144808..e45c3e992 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -3445,7 +3445,7 @@ top:
 }
 
 static void
-zfs_putpage_commit_cb(void *arg)
+zfs_putpage_sync_commit_cb(void *arg)
 {
 	struct page *pp = arg;
 
@@ -3453,13 +3453,26 @@ zfs_putpage_commit_cb(void *arg)
 	end_page_writeback(pp);
 }
 
+static void
+zfs_putpage_async_commit_cb(void *arg)
+{
+	struct page *pp = arg;
+	znode_t *zp = ITOZ(pp->mapping->host);
+
+	ClearPageError(pp);
+	end_page_writeback(pp);
+	atomic_dec_32(&zp->z_async_writes_cnt);
+}
+
 /*
  * Push a page out to disk, once the page is on stable storage the
  * registered commit callback will be run as notification of completion.
  *
- *	IN:	ip	- page mapped for inode.
- *		pp	- page to push (page is locked)
- *		wbc	- writeback control data
+ *	IN:	ip	 - page mapped for inode.
+ *		pp	 - page to push (page is locked)
+ *		wbc	 - writeback control data
+ *		for_sync - does the caller intend to wait synchronously for the
+ *			   page writeback to complete?
  *
  *	RETURN:	0 if success
  *		error code if failure
@@ -3469,7 +3482,8 @@ zfs_putpage_commit_cb(void *arg)
  */
 /* ARGSUSED */
 int
-zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
+    boolean_t for_sync)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
@@ -3567,6 +3581,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		zfs_rangelock_exit(lr);
 
 		if (wbc->sync_mode != WB_SYNC_NONE) {
+			/*
+			 * Speed up any non-sync page writebacks since
+			 * they may take several seconds to complete.
+			 * Refer to the comment in zpl_fsync() (when
+			 * HAVE_FSYNC_RANGE is defined) for details.
+			 */
+			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
+				zil_commit(zfsvfs->z_log, zp->z_id);
+			}
+
 			if (PageWriteback(pp))
 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
 				folio_wait_bit(page_folio(pp), PG_writeback);
@@ -3592,6 +3616,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	 * was in fact not skipped and should not be counted as if it were.
 	 */
 	wbc->pages_skipped--;
+	if (!for_sync)
+		atomic_inc_32(&zp->z_async_writes_cnt);
 	set_page_writeback(pp);
 	unlock_page(pp);
 
@@ -3613,6 +3639,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 #endif
 		ClearPageError(pp);
 		end_page_writeback(pp);
+		if (!for_sync)
+			atomic_dec_32(&zp->z_async_writes_cnt);
 		zfs_rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (err);
@@ -3637,7 +3665,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
 	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
-	    zfs_putpage_commit_cb, pp);
+	    for_sync ? zfs_putpage_sync_commit_cb :
+	    zfs_putpage_async_commit_cb, pp);
+
 	dmu_tx_commit(tx);
 
 	zfs_rangelock_exit(lr);
@@ -3649,6 +3679,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		 * performance reasons.
 		 */
 		zil_commit(zfsvfs->z_log, zp->z_id);
+	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
+		/*
+		 * If the caller does not intend to wait synchronously
+		 * for this page writeback to complete and there are active
+		 * synchronous calls on this file, do a commit so that
+		 * the latter don't accidentally end up waiting for
+		 * our writeback to complete. Refer to the comment in
+		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
+		 */
+		zil_commit(zfsvfs->z_log, zp->z_id);
 	}
 
 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c
index f3475b4d9..ec3cc0d1a 100644
--- a/module/os/linux/zfs/zfs_znode.c
+++ b/module/os/linux/zfs/zfs_znode.c
@@ -134,6 +134,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	zp->z_acl_cached = NULL;
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
+	zp->z_sync_writes_cnt = 0;
+	zp->z_async_writes_cnt = 0;
+
 	return (0);
 }
 
@@ -151,9 +154,12 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	rw_destroy(&zp->z_xattr_lock);
 	zfs_rangelock_fini(&zp->z_rangelock);
 
-	ASSERT(zp->z_dirlocks == NULL);
-	ASSERT(zp->z_acl_cached == NULL);
-	ASSERT(zp->z_xattr_cached == NULL);
+	ASSERT3P(zp->z_dirlocks, ==, NULL);
+	ASSERT3P(zp->z_acl_cached, ==, NULL);
+	ASSERT3P(zp->z_xattr_cached, ==, NULL);
+
+	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
+	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 static int
@@ -549,6 +555,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
+	zp->z_sync_writes_cnt = 0;
+	zp->z_async_writes_cnt = 0;
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index effd7dae1..b1f252254 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -165,17 +165,56 @@ static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
+	znode_t *zp = ITOZ(inode);
+	zfsvfs_t *zfsvfs = ITOZSB(inode);
 	cred_t *cr = CRED();
 	int error;
 	fstrans_cookie_t cookie;
 
+	/*
+	 * The variables z_sync_writes_cnt and z_async_writes_cnt work in
+	 * tandem so that sync writes can detect if there are any non-sync
+	 * writes going on and vice-versa. The "vice-versa" part to this logic
+	 * is located in zfs_putpage() where non-sync writes check if there are
+	 * any ongoing sync writes. If any sync and non-sync writes overlap,
+	 * we do a commit to complete the non-sync writes since the latter can
+	 * potentially take several seconds to complete and thus block sync
+	 * writes in the upcoming call to filemap_write_and_wait_range().
+	 */
+	atomic_inc_32(&zp->z_sync_writes_cnt);
+	/*
+	 * If the following check does not detect an overlapping non-sync write
+	 * (say because it's just about to start), then it is guaranteed that
+	 * the non-sync write will detect this sync write. This is because we
+	 * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
+	 * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
+	 * zfs_putpage() respectively.
+	 */
+	if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
+		ZPL_ENTER(zfsvfs);
+		zil_commit(zfsvfs->z_log, zp->z_id);
+		ZPL_EXIT(zfsvfs);
+	}
+
 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+
+	/*
+	 * The sync write is not complete yet but we decrement
+	 * z_sync_writes_cnt since zfs_fsync() increments and decrements
+	 * it internally. If a non-sync write starts just after the decrement
+	 * operation but before we call zfs_fsync(), it may not detect this
+	 * overlapping sync write but it does not matter since we have already
+	 * gone past filemap_write_and_wait_range() and we won't block due to
+	 * the non-sync write.
+	 */
+	atomic_dec_32(&zp->z_sync_writes_cnt);
+
 	if (error)
 		return (error);
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
-	error = -zfs_fsync(ITOZ(inode), datasync, cr);
+	error = -zfs_fsync(zp, datasync, cr);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 	ASSERT3S(error, <=, 0);
@@ -675,14 +714,14 @@ zpl_readahead(struct readahead_control *ractl)
 static int
 zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 {
-	struct address_space *mapping = data;
+	boolean_t *for_sync = data;
 	fstrans_cookie_t cookie;
 
 	ASSERT(PageLocked(pp));
 	ASSERT(!PageWriteback(pp));
 
 	cookie = spl_fstrans_mark();
-	(void) zfs_putpage(mapping->host, pp, wbc);
+	(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
 	spl_fstrans_unmark(cookie);
 
 	return (0);
@@ -709,8 +748,9 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
 	 * and then we commit it all in one go.
 	 */
+	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
 	wbc->sync_mode = WB_SYNC_NONE;
-	result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+	result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync);
 	if (sync_mode != wbc->sync_mode) {
 		ZPL_ENTER(zfsvfs);
 		ZPL_VERIFY_ZP(zp);
@@ -726,7 +766,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 		 * details). That being said, this is a no-op in most cases.
 		 */
 		wbc->sync_mode = sync_mode;
-		result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+		result = write_cache_pages(mapping, wbc, zpl_putpage,
+		    &for_sync);
 	}
 	return (result);
 }
@@ -743,7 +784,9 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
 	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		wbc->sync_mode = WB_SYNC_ALL;
 
-	return (zpl_putpage(pp, wbc, pp->mapping));
+	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);
+
+	return (zpl_putpage(pp, wbc, &for_sync));
 }
 
 /*
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index b9498d17e..bf5524cee 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
+		atomic_inc_32(&zp->z_sync_writes_cnt);
 		zil_commit(zfsvfs->z_log, zp->z_id);
+		atomic_dec_32(&zp->z_sync_writes_cnt);
 		ZFS_EXIT(zfsvfs);
 	}
 	tsd_set(zfs_fsyncer_key, NULL);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 07b79fe99..6c2296d4c 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -672,7 +672,7 @@ tags = ['functional', 'migration']
 
 [tests/functional/mmap]
 tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
-    'mmap_write_001_pos']
+    'mmap_write_001_pos', 'mmap_sync_001_pos']
 tags = ['functional', 'mmap']
 
 [tests/functional/mount]
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index d6fd55644..878b30025 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -189,6 +189,7 @@ if sys.platform.startswith('freebsd'):
         'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
         'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
         'link_count/link_count_001': ['SKIP', na_reason],
+        'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
     })
 elif sys.platform.startswith('linux'):
     known.update({
diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am
index d1c29fcd1..7ec4cb619 100644
--- a/tests/zfs-tests/cmd/Makefile.am
+++ b/tests/zfs-tests/cmd/Makefile.am
@@ -20,6 +20,7 @@ SUBDIRS = \
 	mmap_exec \
 	mmap_libaio \
 	mmap_seek \
+	mmap_sync \
 	mmapwrite \
 	nvlist_to_lua \
 	randwritecomp \
diff --git a/tests/zfs-tests/cmd/mmap_sync/.gitignore b/tests/zfs-tests/cmd/mmap_sync/.gitignore
new file mode 100644
index 000000000..c721f472b
--- /dev/null
+++ b/tests/zfs-tests/cmd/mmap_sync/.gitignore
@@ -0,0 +1 @@
+/mmap_sync
diff --git a/tests/zfs-tests/cmd/mmap_sync/Makefile.am b/tests/zfs-tests/cmd/mmap_sync/Makefile.am
new file mode 100644
index 000000000..313e8db5c
--- /dev/null
+++ b/tests/zfs-tests/cmd/mmap_sync/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = mmap_sync
+mmap_sync_SOURCES = mmap_sync.c
diff --git a/tests/zfs-tests/cmd/mmap_sync.c b/tests/zfs-tests/cmd/mmap_sync/mmap_sync.c
similarity index 100%
rename from tests/zfs-tests/cmd/mmap_sync.c
rename to tests/zfs-tests/cmd/mmap_sync/mmap_sync.c
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg
index 78802c9fb..8ac38dfd8 100644
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -207,6 +207,7 @@ export ZFSTEST_FILES='badsend
     mmap_exec
     mmap_libaio
     mmap_seek
+    mmap_sync
     mmapwrite
     nvlist_to_lua
     randfree_file
diff --git a/tests/zfs-tests/tests/functional/mmap/Makefile.am b/tests/zfs-tests/tests/functional/mmap/Makefile.am
index 301007f63..526405954 100644
--- a/tests/zfs-tests/tests/functional/mmap/Makefile.am
+++ b/tests/zfs-tests/tests/functional/mmap/Makefile.am
@@ -6,7 +6,8 @@ dist_pkgdata_SCRIPTS = \
 	mmap_read_001_pos.ksh \
 	mmap_write_001_pos.ksh \
 	mmap_libaio_001_pos.ksh \
-	mmap_seek_001_pos.ksh
+	mmap_seek_001_pos.ksh \
+	mmap_sync_001_pos.ksh
 
 dist_pkgdata_DATA = \
 	mmap.cfg
diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh
new file mode 100755
index 000000000..b764d6607
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/mmap/mmap_sync_001_pos.ksh
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# msync()s of an mmap()'ed file should complete quickly during
+# background dirty page writebacks by the kernel.
+#
+
+function cleanup
+{
+	log_must eval "echo $saved_vm_dirty_expire_centisecs > /proc/sys/vm/dirty_expire_centisecs"
+	log_must eval "echo $saved_vm_dirty_background_ratio > /proc/sys/vm/dirty_background_ratio"
+	log_must eval "echo $saved_vm_dirty_writeback_centisecs > /proc/sys/vm/dirty_writeback_centisecs"
+
+	# revert to some sensible defaults if the values we saved
+	# were incorrect due to a previous run being interrupted
+	if [ $(</proc/sys/vm/dirty_expire_centisecs) -eq 1 ]; then
+		log_must eval "echo 3000 > /proc/sys/vm/dirty_expire_centisecs"
+	fi
+
+	if [ $(</proc/sys/vm/dirty_background_ratio) -eq 0 ]; then
+		log_must eval "echo 10 > /proc/sys/vm/dirty_background_ratio"
+	fi
+
+	if [ $(</proc/sys/vm/dirty_writeback_centisecs) -eq 1 ]; then
+		log_must eval "echo 500 > /proc/sys/vm/dirty_writeback_centisecs"
+	fi
+}
+
+if ! is_linux; then
+	log_unsupported "Only supported on Linux, requires /proc/sys/vm/ tunables"
+fi
+
+log_onexit cleanup
+log_assert "Run the tests for mmap_sync"
+
+read -r saved_vm_dirty_expire_centisecs < /proc/sys/vm/dirty_expire_centisecs
+read -r saved_vm_dirty_background_ratio < /proc/sys/vm/dirty_background_ratio
+read -r saved_vm_dirty_writeback_centisecs < /proc/sys/vm/dirty_writeback_centisecs
+
+log_must eval "echo 1 > /proc/sys/vm/dirty_expire_centisecs"
+log_must eval "echo 1 > /proc/sys/vm/dirty_background_bytes"
+log_must eval "echo 1 > /proc/sys/vm/dirty_writeback_centisecs"
+
+log_must mmap_sync
+log_pass "mmap_sync tests passed."