From 1eacf2b3b0a1d3bccaab83dece44d671ba30292d Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Sat, 4 May 2019 16:39:10 -0700 Subject: [PATCH] Improve rate at which new zvols are processed The kernel function which adds new zvols as disks to the system, add_disk(), briefly opens and closes the zvol as part of its work. Closing a zvol involves waiting for two txgs to sync. This, combined with the fact that the taskq processing new zvols is single threaded, makes the processing of new zvols slow. Waiting for these txgs to sync is only necessary if the zvol has been written to, which is not the case during add_disk(). This change adds tracking of whether a zvol has been written to so that we can skip the txg_wait_synced() calls when they are unnecessary. This change also fixes the flags passed to blkdev_get_by_path() by vdev_disk_open() to be FMODE_READ | FMODE_WRITE | FMODE_EXCL instead of just FMODE_EXCL. The flags were being incorrectly calculated because we were using the wrong version of vdev_bdev_mode(). Reviewed-by: George Wilson Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Brian Behlendorf Signed-off-by: John Gallagher Closes #8526 Closes #8615 --- module/zfs/vdev_disk.c | 4 +- module/zfs/zvol.c | 258 +++++++++++++++++++++++------------------ 2 files changed, 146 insertions(+), 116 deletions(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index b329ef3c2..1419ae6ad 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -23,7 +23,7 @@ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
*/ #include @@ -56,7 +56,7 @@ typedef struct dio_request { } dio_request_t; -#ifdef HAVE_OPEN_BDEV_EXCLUSIVE +#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH) static fmode_t vdev_bdev_mode(int smode) { diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 9fd689fbd..a77339d7f 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -36,7 +36,7 @@ * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ /* @@ -155,6 +155,11 @@ typedef struct { } zvol_task_t; #define ZVOL_RDONLY 0x1 +/* + * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which + * specifies whether or not the zvol _can_ be written to) + */ +#define ZVOL_WRITTEN_TO 0x2 static uint64_t zvol_name_hash(const char *name) @@ -742,6 +747,7 @@ zvol_write(void *arg) zvol_state_t *zv = zvr->zv; ASSERT(zv && zv->zv_open_count > 0); + ASSERT(zv->zv_zilog != NULL); ssize_t start_resid = uio.uio_resid; unsigned long start_jif = jiffies; @@ -832,6 +838,7 @@ zvol_discard(void *arg) unsigned long start_jif; ASSERT(zv && zv->zv_open_count > 0); + ASSERT(zv->zv_zilog != NULL); start_jif = jiffies; blk_generic_start_io_acct(zv->zv_queue, WRITE, bio_sectors(bio), @@ -930,116 +937,6 @@ zvol_read(void *arg) kmem_free(zvr, sizeof (zv_request_t)); } -static MAKE_REQUEST_FN_RET -zvol_request(struct request_queue *q, struct bio *bio) -{ - zvol_state_t *zv = q->queuedata; - fstrans_cookie_t cookie = spl_fstrans_mark(); - uint64_t offset = BIO_BI_SECTOR(bio) << 9; - uint64_t size = BIO_BI_SIZE(bio); - int rw = bio_data_dir(bio); - zv_request_t *zvr; - - if (bio_has_data(bio) && offset + size > zv->zv_volsize) { - printk(KERN_INFO - "%s: bad access: offset=%llu, size=%lu\n", - zv->zv_disk->disk_name, - (long long unsigned)offset, - (long unsigned)size); - - BIO_END_IO(bio, -SET_ERROR(EIO)); - goto 
out; - } - - if (rw == WRITE) { - boolean_t need_sync = B_FALSE; - - if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - BIO_END_IO(bio, -SET_ERROR(EROFS)); - goto out; - } - - /* - * To be released in the I/O function. See the comment on - * rangelock_enter() below. - */ - rw_enter(&zv->zv_suspend_lock, RW_READER); - - /* bio marked as FLUSH need to flush before write */ - if (bio_is_flush(bio)) - zil_commit(zv->zv_zilog, ZVOL_OBJ); - - /* Some requests are just for flush and nothing else. */ - if (size == 0) { - rw_exit(&zv->zv_suspend_lock); - BIO_END_IO(bio, 0); - goto out; - } - - zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); - zvr->zv = zv; - zvr->bio = bio; - - /* - * To be released in the I/O function. Since the I/O functions - * are asynchronous, we take it here synchronously to make - * sure overlapped I/Os are properly ordered. - */ - zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_WRITER); - /* - * Sync writes and discards execute zil_commit() which may need - * to take a RL_READER lock on the whole block being modified - * via its zillog->zl_get_data(): to avoid circular dependency - * issues with taskq threads execute these requests - * synchronously here in zvol_request(). - */ - need_sync = bio_is_fua(bio) || - zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; - if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { - if (zvol_request_sync || need_sync || - taskq_dispatch(zvol_taskq, zvol_discard, zvr, - TQ_SLEEP) == TASKQID_INVALID) - zvol_discard(zvr); - } else { - if (zvol_request_sync || need_sync || - taskq_dispatch(zvol_taskq, zvol_write, zvr, - TQ_SLEEP) == TASKQID_INVALID) - zvol_write(zvr); - } - } else { - /* - * The SCST driver, and possibly others, may issue READ I/Os - * with a length of zero bytes. These empty I/Os contain no - * data and require no additional handling. 
- */ - if (size == 0) { - BIO_END_IO(bio, 0); - goto out; - } - - zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); - zvr->zv = zv; - zvr->bio = bio; - - rw_enter(&zv->zv_suspend_lock, RW_READER); - - zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, - RL_READER); - if (zvol_request_sync || taskq_dispatch(zvol_taskq, - zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID) - zvol_read(zvr); - } - -out: - spl_fstrans_unmark(cookie); -#ifdef HAVE_MAKE_REQUEST_FN_RET_INT - return (0); -#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC) - return (BLK_QC_T_NONE); -#endif -} - /* ARGSUSED */ static void zvol_get_done(zgd_t *zgd, int error) @@ -1120,6 +1017,133 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) return (SET_ERROR(error)); } +static MAKE_REQUEST_FN_RET +zvol_request(struct request_queue *q, struct bio *bio) +{ + zvol_state_t *zv = q->queuedata; + fstrans_cookie_t cookie = spl_fstrans_mark(); + uint64_t offset = BIO_BI_SECTOR(bio) << 9; + uint64_t size = BIO_BI_SIZE(bio); + int rw = bio_data_dir(bio); + zv_request_t *zvr; + + if (bio_has_data(bio) && offset + size > zv->zv_volsize) { + printk(KERN_INFO + "%s: bad access: offset=%llu, size=%lu\n", + zv->zv_disk->disk_name, + (long long unsigned)offset, + (long unsigned)size); + + BIO_END_IO(bio, -SET_ERROR(EIO)); + goto out; + } + + if (rw == WRITE) { + boolean_t need_sync = B_FALSE; + + if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { + BIO_END_IO(bio, -SET_ERROR(EROFS)); + goto out; + } + + /* + * To be released in the I/O function. See the comment on + * rangelock_enter() below. + */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. 
+ */ + if (zv->zv_zilog == NULL) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + } + rw_downgrade(&zv->zv_suspend_lock); + } + + /* bio marked as FLUSH need to flush before write */ + if (bio_is_flush(bio)) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + /* Some requests are just for flush and nothing else. */ + if (size == 0) { + rw_exit(&zv->zv_suspend_lock); + BIO_END_IO(bio, 0); + goto out; + } + + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv = zv; + zvr->bio = bio; + + /* + * To be released in the I/O function. Since the I/O functions + * are asynchronous, we take it here synchronously to make + * sure overlapped I/Os are properly ordered. + */ + zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, + RL_WRITER); + /* + * Sync writes and discards execute zil_commit() which may need + * to take a RL_READER lock on the whole block being modified + * via its zillog->zl_get_data(): to avoid circular dependency + * issues with taskq threads execute these requests + * synchronously here in zvol_request(). + */ + need_sync = bio_is_fua(bio) || + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { + if (zvol_request_sync || need_sync || + taskq_dispatch(zvol_taskq, zvol_discard, zvr, + TQ_SLEEP) == TASKQID_INVALID) + zvol_discard(zvr); + } else { + if (zvol_request_sync || need_sync || + taskq_dispatch(zvol_taskq, zvol_write, zvr, + TQ_SLEEP) == TASKQID_INVALID) + zvol_write(zvr); + } + } else { + /* + * The SCST driver, and possibly others, may issue READ I/Os + * with a length of zero bytes. These empty I/Os contain no + * data and require no additional handling. 
+ */ + if (size == 0) { + BIO_END_IO(bio, 0); + goto out; + } + + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv = zv; + zvr->bio = bio; + + rw_enter(&zv->zv_suspend_lock, RW_READER); + + zvr->lr = rangelock_enter(&zv->zv_rangelock, offset, size, + RL_READER); + if (zvol_request_sync || taskq_dispatch(zvol_taskq, + zvol_read, zvr, TQ_SLEEP) == TASKQID_INVALID) + zvol_read(zvr); + } + +out: + spl_fstrans_unmark(cookie); +#ifdef HAVE_MAKE_REQUEST_FN_RET_INT + return (0); +#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC) + return (BLK_QC_T_NONE); +#endif +} + /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ @@ -1157,6 +1181,9 @@ zvol_setup_zv(zvol_state_t *zv) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); + zv->zv_zilog = NULL; + zv->zv_flags &= ~ZVOL_WRITTEN_TO; + error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); @@ -1171,7 +1198,6 @@ zvol_setup_zv(zvol_state_t *zv) set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; - zv->zv_zilog = zil_open(os, zvol_get_data); if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { @@ -1194,7 +1220,11 @@ zvol_shutdown_zv(zvol_state_t *zv) ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); - zil_close(zv->zv_zilog); + if (zv->zv_flags & ZVOL_WRITTEN_TO) { + ASSERT(zv->zv_zilog != NULL); + zil_close(zv->zv_zilog); + } + zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, FTAG); @@ -1204,7 +1234,7 @@ zvol_shutdown_zv(zvol_state_t *zv) * Evict cached data. We must write out any dirty data before * disowning the dataset. */ - if (!(zv->zv_flags & ZVOL_RDONLY)) + if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); }