From 3c0e5c0f455576d045fa443cbab74834d70ded55 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 1 Aug 2011 21:28:51 -0700
Subject: [PATCH] Cleanup mmap(2) writes

While the existing implementation of .writepage()/zpl_putpage() was
functional it was not entirely correct.  In particular, it would move
dirty pages in to a clean state simply after copying them in to the
ARC cache.  This would result in the pages being lost if the system
were to crash even though the Linux VFS believed them to be safe on
stable storage.

Since at the moment virtually all I/O, except mmap(2), bypasses the
page cache this isn't as bad as it sounds.  However, as we hopefully
start using the page cache more, getting this right becomes more
important, so it's good to improve this now.

This patch takes a big step in that direction by updating the code
to correctly move dirty pages through a writeback phase before they
are marked clean.  When a dirty page is copied in to the ARC it will
now be set in writeback and a completion callback is registered with
the transaction.  The page will stay in writeback until the dmu runs
the completion callback indicating the page is on stable storage.
At this point the page can be safely marked clean.

This process is normally entirely asynchronous and will be repeated
for every dirty page.  This may initially sound inefficient but most
of these pages will end up in a few txgs.  That means when they are
eventually written to disk they should be nicely batched.  However,
there is room for improvement.  It may still be desirable to batch
up the pages in to larger writes for the dmu.  This would reduce
the number of callbacks and small 4k buffers required by the ARC.

Finally, if the caller requires that the I/O be done synchronously
by setting WB_SYNC_ALL, or if ZFS_SYNC_ALWAYS is set, then the I/O
will trigger a zil_commit() to flush the data to stable storage.
At that point the registered callbacks will be run, leaving the
data safe on disk and marked clean before returning from .writepage.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 include/sys/zfs_vnops.h |   4 +-
 module/zfs/zfs_vnops.c  | 193 +++++++++++++++++++---------------------
 module/zfs/zpl_file.c   |  19 ++--
 3 files changed, 98 insertions(+), 118 deletions(-)

diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h
index d73fe2f3e..dd25fbcbc 100644
--- a/include/sys/zfs_vnops.h
+++ b/include/sys/zfs_vnops.h
@@ -73,8 +73,8 @@ extern int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag,
 extern int zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag,
     cred_t *cr);
 extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
-extern int zfs_putpage(struct page *page, struct writeback_control *wbc,
-    void *data);
+extern int zfs_putpage(struct inode *ip, struct page *pp,
+    struct writeback_control *wbc);
 extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp,
     size_t len, unsigned long vm_flags);
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 262c1ed64..3331a1706 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -3735,136 +3735,123 @@ top:
 }
 EXPORT_SYMBOL(zfs_link);
 
-/*
- * Push a page out to disk
- *
- *	IN:	vp	- file to push page to.
- *		pp	- page to push.
- *		off	- start of range pushed.
- *		len	- len of range pushed.
- *
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * NOTE: callers must have locked the page to be pushed.
- */
-/* ARGSUSED */
-static int
-zfs_putapage(struct inode *ip, struct page *pp, u_offset_t off, size_t len)
+static void
+zfs_putpage_commit_cb(void *arg, int error)
 {
-	znode_t    *zp  = ITOZ(ip);
-	zfs_sb_t   *zsb = ITOZSB(ip);
-	dmu_tx_t   *tx;
-	caddr_t	   va;
-	int        err;
+	struct page *pp = arg;
 
-	/*
-	 * Can't push pages past end-of-file.
-	 */
-	if (off >= zp->z_size) {
-		/* ignore all pages */
-		err = 0;
-		goto out;
-	} else if (off + len > zp->z_size)
-		len = zp->z_size - off;
+	if (error) {
+		__set_page_dirty_nobuffers(pp);
 
-	if (zfs_owner_overquota(zsb, zp, B_FALSE) ||
-	    zfs_owner_overquota(zsb, zp, B_TRUE)) {
-		err = EDQUOT;
-		goto out;
-	}
-top:
-	tx = dmu_tx_create(zsb->z_os);
-	dmu_tx_hold_write(tx, zp->z_id, off, len);
-
-	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
-	zfs_sa_upgrade_txholds(tx, zp);
-	err = dmu_tx_assign(tx, TXG_NOWAIT);
-	if (err != 0) {
-		if (err == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		goto out;
+		if (error != ECANCELED)
+			SetPageError(pp);
+	} else {
+		ClearPageError(pp);
 	}
 
-	va = kmap(pp);
-	ASSERT3U(len, <=, PAGESIZE);
-	dmu_write(zsb->z_os, zp->z_id, off, len, va, tx);
-	kunmap(pp);
-
-	if (err == 0) {
-		uint64_t mtime[2], ctime[2];
-		sa_bulk_attr_t bulk[3];
-		int count = 0;
-
-		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL,
-		    &mtime, 16);
-		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
-		    &ctime, 16);
-		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL,
-		    &zp->z_pflags, 8);
-		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
-		    B_TRUE);
-		zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, off, len, 0);
-	}
-	dmu_tx_commit(tx);
-
-out:
-	return (err);
+	end_page_writeback(pp);
 }
 
 /*
- * Copy the portion of the file indicated from page into the file.
+ * Push a page out to disk.  Once the page is on stable storage the
+ * registered commit callback will be run as notification of completion.
  *
- *	IN:	ip	- inode of file to push page data to.
- *		wbc	- Unused parameter
- *		data	- pointer to address_space
+ *	IN:	ip	- page mapped for inode.
+ *		pp	- page to push (page is locked)
+ *		wbc	- writeback control data
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
- *	vp - ctime|mtime updated
+ *	ip - ctime|mtime updated
  */
-/*ARGSUSED*/
+/* ARGSUSED */
 int
-zfs_putpage(struct page *page, struct writeback_control *wbc, void *data)
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 {
-	struct address_space *mapping = data;
-	struct inode         *ip      = mapping->host;
-	znode_t              *zp      = ITOZ(ip);
-	zfs_sb_t             *zsb     = ITOZSB(ip);
-	u_offset_t	     io_off;
-	size_t		     io_len;
-	size_t		     len;
-	int		     error;
+	znode_t		*zp = ITOZ(ip);
+	zfs_sb_t	*zsb = ITOZSB(ip);
+	loff_t		offset;
+	loff_t		pgoff;
+        unsigned int	pglen;
+	dmu_tx_t	*tx;
+	caddr_t		va;
+	int		err = 0;
+	uint64_t	mtime[2], ctime[2];
+	sa_bulk_attr_t	bulk[3];
+	int		cnt = 0;
 
-	io_off = page_offset(page);
-	io_len = PAGESIZE;
 
-	ZFS_ENTER(zsb);
-	ZFS_VERIFY_ZP(zp);
+	ASSERT(PageLocked(pp));
 
-	if (io_off > zp->z_size) {
-		/* past end of file */
-		ZFS_EXIT(zsb);
+	pgoff = page_offset(pp);     /* Page byte-offset in file */
+	offset = i_size_read(ip);    /* File length in bytes */
+	pglen = MIN(PAGE_CACHE_SIZE, /* Page length in bytes */
+	    P2ROUNDUP(offset, PAGE_CACHE_SIZE)-pgoff);
+
+	/* Page is beyond end of file */
+	if (pgoff >= offset) {
+		unlock_page(pp);
 		return (0);
 	}
 
-	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
+	/* Truncate page length to end of file */
+	if (pgoff + pglen > offset)
+		pglen = offset - pgoff;
 
-	error = zfs_putapage(ip, page, io_off, len);
+#if 0
+	/*
+	 * FIXME: Allow mmap writes past its quota.  The correct fix
+	 * is to register a page_mkwrite() handler to count the page
+	 * against its quota when it is about to be dirtied.
+	 */
+	if (zfs_owner_overquota(zsb, zp, B_FALSE) ||
+	    zfs_owner_overquota(zsb, zp, B_TRUE)) {
+		err = EDQUOT;
+	}
+#endif
 
-	if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
+	set_page_writeback(pp);
+	unlock_page(pp);
+
+	tx = dmu_tx_create(zsb->z_os);
+
+	dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp);
+
+	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (err != 0) {
+		if (err == ERESTART)
+			dmu_tx_wait(tx);
+
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	va = kmap(pp);
+	ASSERT3U(pglen, <=, PAGE_CACHE_SIZE);
+	dmu_write(zsb->z_os, zp->z_id, pgoff, pglen, va, tx);
+	kunmap(pp);
+
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zsb), NULL, &zp->z_pflags, 8);
+	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+	zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0);
+
+	dmu_tx_commit(tx);
+	ASSERT3S(err, ==, 0);
+
+	if ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ||
+	    (wbc->sync_mode == WB_SYNC_ALL))
 		zil_commit(zsb->z_log, zp->z_id);
-	ZFS_EXIT(zsb);
-	return (error);
+
+	return (err);
 }
-EXPORT_SYMBOL(zfs_putpage);
 
 /*ARGSUSED*/
 void
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
index c2e3a6bdc..7eaf65c6e 100644
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -352,7 +352,10 @@ zpl_readpage(struct file *filp, struct page *pp)
 int
 zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 {
-	int error;
+	struct address_space *mapping = data;
+
+	ASSERT(PageLocked(pp));
+	ASSERT(!PageWriteback(pp));
 
 	/*
 	 * Disable the normal reclaim path for zpl_putpage().  This
@@ -362,20 +365,10 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 	 * zpl_putpage() again resulting in a deadlock.
 	 */
 	current->flags |= PF_MEMALLOC;
-	error = -zfs_putpage(pp, wbc, data);
+	(void) zfs_putpage(mapping->host, pp, wbc);
 	current->flags &= ~PF_MEMALLOC;
 
-	if (error) {
-		SetPageError(pp);
-		ClearPageUptodate(pp);
-	} else {
-		ClearPageError(pp);
-		SetPageUptodate(pp);
-		flush_dcache_page(pp);
-	}
-
-	unlock_page(pp);
-	return error;
+	return (0);
 }
 
 static int