Sequential scrub and resilvers

Currently, scrubs and resilvers can take an extremely long time to complete. This is largely due to the fact that zfs scans process pools in logical order, as determined by each block's bookmark. This makes sense from a simplicity perspective, but blocks in zfs are often scattered randomly across disks, particularly due to zfs's copy-on-write mechanisms. This patch improves performance by splitting scrubs and resilvers into a metadata scanning phase and an IO issuing phase. The metadata scan reads through the structure of the pool and gathers an in-memory queue of I/Os, sorted by size and offset on disk. The issuing phase will then issue the scrub I/Os as sequentially as possible, greatly improving performance. This patch also updates and cleans up some of the scan code which has not been updated in several years. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Authored-by: Saso Kiselkov <saso.kiselkov@nexenta.com> Authored-by: Alek Pinchuk <apinchuk@datto.com> Authored-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tom Caputi <tcaputi@datto.com> Closes #3625 Closes #6256
2025-11-07 06:44:54 +03:00 · 2017-11-15 20:27:01 -05:00 · 2017-11-15 20:27:01 -05:00 · d4a72f2386
commit d4a72f2386
parent e301113c17
37 changed files with 3051 additions and 831 deletions
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@ -2226,8 +2226,6 @@ dump_dir(objset_t *os)
 		max_slot_used = object + dnode_slots - 1;
 	}

-	ASSERT3U(object_count, ==, usedobjs);
-
 	(void) printf("\n");

 	(void) printf("    Dnode slots:\n");
@ -2245,6 +2243,8 @@ dump_dir(objset_t *os)
 		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
 		abort();
 	}
+
+	ASSERT3U(object_count, ==, usedobjs);
 }

 static void
@ -3089,7 +3089,7 @@ zdb_blkptr_done(zio_t *zio)
 	abd_free(zio->io_abd);

 	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_inflight--;
+	spa->spa_load_verify_ios--;
 	cv_broadcast(&spa->spa_scrub_io_cv);

 	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@ -3160,9 +3160,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 			flags |= ZIO_FLAG_SPECULATIVE;

 		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight > max_inflight)
+		while (spa->spa_load_verify_ios > max_inflight)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-		spa->spa_scrub_inflight++;
+		spa->spa_load_verify_ios++;
 		mutex_exit(&spa->spa_scrub_lock);

 		zio_nowait(zio_read(NULL, spa, bp, abd, size,
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@ -57,6 +57,8 @@
 #include <sys/fm/protocol.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/mount.h>
+#include <sys/sysmacros.h>
+
 #include <math.h>

 #include <libzfs.h>
@ -1761,7 +1763,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
 	(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
 	    (uint64_t **)&ps, &c);

-	if (ps && ps->pss_state == DSS_SCANNING &&
+	if (ps != NULL && ps->pss_state == DSS_SCANNING &&
 	    vs->vs_scan_processed != 0 && children == 0) {
 		(void) printf(gettext("  (%s)"),
 		    (ps->pss_func == POOL_SCAN_RESILVER) ?
@ -5967,11 +5969,13 @@ void
 print_scan_status(pool_scan_stat_t *ps)
 {
 	time_t start, end, pause;
-	uint64_t elapsed, mins_left, hours_left;
-	uint64_t pass_exam, examined, total;
-	uint_t rate;
+	uint64_t total_secs_left;
+	uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
+	uint64_t pass_scanned, scanned, pass_issued, issued, total;
+	uint_t scan_rate, issue_rate;
 	double fraction_done;
-	char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
+	char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
+	char srate_buf[7], irate_buf[7];

 	(void) printf(gettext("  scan: "));

@ -5985,30 +5989,35 @@ print_scan_status(pool_scan_stat_t *ps)
 	start = ps->pss_start_time;
 	end = ps->pss_end_time;
 	pause = ps->pss_pass_scrub_pause;
+
 	zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf));

 	assert(ps->pss_func == POOL_SCAN_SCRUB ||
 	    ps->pss_func == POOL_SCAN_RESILVER);
-	/*
-	 * Scan is finished or canceled.
-	 */
+
+	/* Scan is finished or canceled. */
 	if (ps->pss_state == DSS_FINISHED) {
-		uint64_t minutes_taken = (end - start) / 60;
-		char *fmt = NULL;
+		total_secs_left = end - start;
+		days_left = total_secs_left / 60 / 60 / 24;
+		hours_left = (total_secs_left / 60 / 60) % 24;
+		mins_left = (total_secs_left / 60) % 60;
+		secs_left = (total_secs_left % 60);

 		if (ps->pss_func == POOL_SCAN_SCRUB) {
-			fmt = gettext("scrub repaired %s in %lluh%um with "
-			    "%llu errors on %s");
+			(void) printf(gettext("scrub repaired %s "
+			    "in %llu days %02llu:%02llu:%02llu "
+			    "with %llu errors on %s"), processed_buf,
+			    (u_longlong_t)days_left, (u_longlong_t)hours_left,
+			    (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+			    (u_longlong_t)ps->pss_errors, ctime(&end));
 		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
-			fmt = gettext("resilvered %s in %lluh%um with "
-			    "%llu errors on %s");
+			(void) printf(gettext("resilvered %s "
+			    "in %llu days %02llu:%02llu:%02llu "
+			    "with %llu errors on %s"), processed_buf,
+			    (u_longlong_t)days_left, (u_longlong_t)hours_left,
+			    (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+			    (u_longlong_t)ps->pss_errors, ctime(&end));
 		}
-		/* LINTED */
-		(void) printf(fmt, processed_buf,
-		    (u_longlong_t)(minutes_taken / 60),
-		    (uint_t)(minutes_taken % 60),
-		    (u_longlong_t)ps->pss_errors,
-		    ctime((time_t *)&end));
 		return;
 	} else if (ps->pss_state == DSS_CANCELED) {
 		if (ps->pss_func == POOL_SCAN_SCRUB) {
@ -6023,18 +6032,14 @@ print_scan_status(pool_scan_stat_t *ps)

 	assert(ps->pss_state == DSS_SCANNING);

-	/*
-	 * Scan is in progress.
-	 */
+	/* Scan is in progress. Resilvers can't be paused. */
 	if (ps->pss_func == POOL_SCAN_SCRUB) {
 		if (pause == 0) {
 			(void) printf(gettext("scrub in progress since %s"),
 			    ctime(&start));
 		} else {
-			char buf[32];
-			struct tm *p = localtime(&pause);
-			(void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
-			(void) printf(gettext("scrub paused since %s\n"), buf);
+			(void) printf(gettext("scrub paused since %s"),
+			    ctime(&pause));
 			(void) printf(gettext("\tscrub started on %s"),
 			    ctime(&start));
 		}
@ -6043,50 +6048,68 @@ print_scan_status(pool_scan_stat_t *ps)
 		    ctime(&start));
 	}

-	examined = ps->pss_examined ? ps->pss_examined : 1;
+	scanned = ps->pss_examined;
+	pass_scanned = ps->pss_pass_exam;
+	issued = ps->pss_issued;
+	pass_issued = ps->pss_pass_issued;
 	total = ps->pss_to_examine;
-	fraction_done = (double)examined / total;

-	/* elapsed time for this pass */
+	/* we are only done with a block once we have issued the IO for it */
+	fraction_done = (double)issued / total;
+
+	/* elapsed time for this pass, rounding up to 1 if it's 0 */
 	elapsed = time(NULL) - ps->pss_pass_start;
 	elapsed -= ps->pss_pass_scrub_spent_paused;
-	elapsed = elapsed ? elapsed : 1;
-	pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
-	rate = pass_exam / elapsed;
-	rate = rate ? rate : 1;
-	mins_left = ((total - examined) / rate) / 60;
-	hours_left = mins_left / 60;
+	elapsed = (elapsed != 0) ? elapsed : 1;

-	zfs_nicebytes(examined, examined_buf, sizeof (examined_buf));
+	scan_rate = pass_scanned / elapsed;
+	issue_rate = pass_issued / elapsed;
+	total_secs_left = (issue_rate != 0) ?
+	    ((total - issued) / issue_rate) : UINT64_MAX;
+
+	days_left = total_secs_left / 60 / 60 / 24;
+	hours_left = (total_secs_left / 60 / 60) % 24;
+	mins_left = (total_secs_left / 60) % 60;
+	secs_left = (total_secs_left % 60);
+
+	/* format all of the numbers we will be reporting */
+	zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
+	zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
 	zfs_nicebytes(total, total_buf, sizeof (total_buf));
+	zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
+	zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));

-	/*
-	 * do not print estimated time if hours_left is more than 30 days
-	 * or we have a paused scrub
-	 */
+	/* do not print estimated time if we have a paused scrub */
 	if (pause == 0) {
-		zfs_nicebytes(rate, rate_buf, sizeof (rate_buf));
-		(void) printf(gettext("\t%s scanned out of %s at %s/s"),
-		    examined_buf, total_buf, rate_buf);
-		if (hours_left < (30 * 24)) {
-			(void) printf(gettext(", %lluh%um to go\n"),
-			    (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
+		(void) printf(gettext("\t%s scanned at %s/s, "
+		    "%s issued at %s/s, %s total\n"),
+		    scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
 	} else {
-			(void) printf(gettext(
-			    ", (scan is slow, no estimated time)\n"));
-		}
-	} else {
-		(void) printf(gettext("\t%s scanned out of %s\n"),
-		    examined_buf, total_buf);
+		(void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
+		    scanned_buf, issued_buf, total_buf);
 	}

 	if (ps->pss_func == POOL_SCAN_RESILVER) {
-		(void) printf(gettext("\t%s resilvered, %.2f%% done\n"),
+		(void) printf(gettext("\t%s resilvered, %.2f%% done"),
 		    processed_buf, 100 * fraction_done);
 	} else if (ps->pss_func == POOL_SCAN_SCRUB) {
-		(void) printf(gettext("\t%s repaired, %.2f%% done\n"),
+		(void) printf(gettext("\t%s repaired, %.2f%% done"),
 		    processed_buf, 100 * fraction_done);
 	}
+
+	if (pause == 0) {
+		if (issue_rate >= 10 * 1024 * 1024) {
+			(void) printf(gettext(", %llu days "
+			    "%02llu:%02llu:%02llu to go\n"),
+			    (u_longlong_t)days_left, (u_longlong_t)hours_left,
+			    (u_longlong_t)mins_left, (u_longlong_t)secs_left);
+		} else {
+			(void) printf(gettext(", no estimated "
+			    "completion time\n"));
+		}
+	} else {
+		(void) printf(gettext("\n"));
+	}
 }

 static void
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@ -66,11 +66,11 @@ typedef struct arc_prune arc_prune_t;
 * while transforming data into its desired format - specifically, when
 * decrypting, the key may not be present, or the HMAC may not be correct
 * which signifies deliberate tampering with the on-disk state
- * (assuming that the checksum was correct). The "error" parameter will be
- * nonzero in this case, even if there is no associated zio.
+ * (assuming that the checksum was correct). If any error occurs, the "buf"
+ * parameter will be NULL.
 */
-typedef void arc_read_done_func_t(zio_t *zio, int error, arc_buf_t *buf,
-    void *private);
+typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *bp, arc_buf_t *buf, void *private);
 typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
 typedef void arc_prune_func_t(int64_t bytes, void *private);

@ -106,44 +106,45 @@ typedef enum arc_flags
 	ARC_FLAG_CACHED			= 1 << 3,	/* I/O was in cache */
 	ARC_FLAG_L2CACHE		= 1 << 4,	/* cache in L2ARC */
 	ARC_FLAG_PREDICTIVE_PREFETCH	= 1 << 5,	/* I/O from zfetch */
+	ARC_FLAG_PRESCIENT_PREFETCH	= 1 << 6,	/* long min lifespan */

 	/*
 	 * Private ARC flags.  These flags are private ARC only flags that
 	 * will show up in b_flags in the arc_hdr_buf_t. These flags should
 	 * only be set by ARC code.
 	 */
-	ARC_FLAG_IN_HASH_TABLE		= 1 << 6,	/* buffer is hashed */
-	ARC_FLAG_IO_IN_PROGRESS		= 1 << 7,	/* I/O in progress */
-	ARC_FLAG_IO_ERROR		= 1 << 8,	/* I/O failed for buf */
-	ARC_FLAG_INDIRECT		= 1 << 9,	/* indirect block */
+	ARC_FLAG_IN_HASH_TABLE		= 1 << 7,	/* buffer is hashed */
+	ARC_FLAG_IO_IN_PROGRESS		= 1 << 8,	/* I/O in progress */
+	ARC_FLAG_IO_ERROR		= 1 << 9,	/* I/O failed for buf */
+	ARC_FLAG_INDIRECT		= 1 << 10,	/* indirect block */
 	/* Indicates that block was read with ASYNC priority. */
-	ARC_FLAG_PRIO_ASYNC_READ	= 1 << 10,
-	ARC_FLAG_L2_WRITING		= 1 << 11,	/* write in progress */
-	ARC_FLAG_L2_EVICTED		= 1 << 12,	/* evicted during I/O */
-	ARC_FLAG_L2_WRITE_HEAD		= 1 << 13,	/* head of write list */
+	ARC_FLAG_PRIO_ASYNC_READ	= 1 << 11,
+	ARC_FLAG_L2_WRITING		= 1 << 12,	/* write in progress */
+	ARC_FLAG_L2_EVICTED		= 1 << 13,	/* evicted during I/O */
+	ARC_FLAG_L2_WRITE_HEAD		= 1 << 14,	/* head of write list */
 	/*
 	 * Encrypted or authenticated on disk (may be plaintext in memory).
 	 * This header has b_crypt_hdr allocated. Does not include indirect
 	 * blocks with checksums of MACs which will also have their X
 	 * (encrypted) bit set in the bp.
 	 */
-	ARC_FLAG_PROTECTED		= 1 << 14,
+	ARC_FLAG_PROTECTED		= 1 << 15,
 	/* data has not been authenticated yet */
-	ARC_FLAG_NOAUTH			= 1 << 15,
+	ARC_FLAG_NOAUTH			= 1 << 16,
 	/* indicates that the buffer contains metadata (otherwise, data) */
-	ARC_FLAG_BUFC_METADATA		= 1 << 16,
+	ARC_FLAG_BUFC_METADATA		= 1 << 17,

 	/* Flags specifying whether optional hdr struct fields are defined */
-	ARC_FLAG_HAS_L1HDR		= 1 << 17,
-	ARC_FLAG_HAS_L2HDR		= 1 << 18,
+	ARC_FLAG_HAS_L1HDR		= 1 << 18,
+	ARC_FLAG_HAS_L2HDR		= 1 << 19,

 	/*
 	 * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
 	 * This allows the l2arc to use the blkptr's checksum to verify
 	 * the data without having to store the checksum in the hdr.
 	 */
-	ARC_FLAG_COMPRESSED_ARC		= 1 << 19,
-	ARC_FLAG_SHARED_DATA		= 1 << 20,
+	ARC_FLAG_COMPRESSED_ARC		= 1 << 20,
+	ARC_FLAG_SHARED_DATA		= 1 << 21,

 	/*
 	 * The arc buffer's compression mode is stored in the top 7 bits of the
--- a/include/sys/dsl_pool.h
+++ b/include/sys/dsl_pool.h
@ -80,6 +80,7 @@ typedef struct zfs_blkstat {

 typedef struct zfs_all_blkstats {
 	zfs_blkstat_t	zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+	kmutex_t	zab_lock;
 } zfs_all_blkstats_t;


--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@ -108,22 +108,56 @@ typedef enum dsl_scan_flags {
 */
 typedef struct dsl_scan {
 	struct dsl_pool *scn_dp;
-
-	boolean_t scn_suspending;
 	uint64_t scn_restart_txg;
 	uint64_t scn_done_txg;
 	uint64_t scn_sync_start_time;
-	zio_t *scn_zio_root;
+	uint64_t scn_issued_before_pass;

 	/* for freeing blocks */
 	boolean_t scn_is_bptree;
 	boolean_t scn_async_destroying;
 	boolean_t scn_async_stalled;
-	uint64_t scn_visited_this_txg;

-	dsl_scan_phys_t scn_phys;
+	/* flags and stats for controlling scan state */
+	boolean_t scn_is_sorted;	/* doing sequential scan */
+	boolean_t scn_clearing;		/* scan is issuing sequential extents */
+	boolean_t scn_checkpointing;	/* scan is issuing all queued extents */
+	boolean_t scn_suspending;	/* scan is suspending until next txg */
+	uint64_t scn_last_checkpoint;	/* time of last checkpoint */
+
+	/* members for thread synchronization */
+	zio_t *scn_zio_root;		/* root zio for waiting on IO */
+	taskq_t *scn_taskq;		/* task queue for issuing extents */
+
+	/* for controlling scan prefetch, protected by spa_scrub_lock */
+	boolean_t scn_prefetch_stop;	/* prefetch should stop */
+	zbookmark_phys_t scn_prefetch_bookmark;	/* prefetch start bookmark */
+	avl_tree_t scn_prefetch_queue;	/* priority queue of prefetch IOs */
+	uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */
+
+	/* per txg statistics */
+	uint64_t scn_visited_this_txg;	/* total bps visited this txg */
+	uint64_t scn_holes_this_txg;
+	uint64_t scn_lt_min_this_txg;
+	uint64_t scn_gt_max_this_txg;
+	uint64_t scn_ddt_contained_this_txg;
+	uint64_t scn_objsets_visited_this_txg;
+	uint64_t scn_avg_seg_size_this_txg;
+	uint64_t scn_segs_this_txg;
+	uint64_t scn_avg_zio_size_this_txg;
+	uint64_t scn_zios_this_txg;
+
+	/* members needed for syncing scan status to disk */
+	dsl_scan_phys_t scn_phys;	/* on disk representation of scan */
+	dsl_scan_phys_t scn_phys_cached;
+	avl_tree_t scn_queue;		/* queue of datasets to scan */
+	uint64_t scn_bytes_pending;	/* outstanding data to issue */
 } dsl_scan_t;

+typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
+
+void scan_init(void);
+void scan_fini(void);
 int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
 void dsl_scan_fini(struct dsl_pool *dp);
 void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
@ -142,6 +176,9 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
    struct dmu_tx *tx);
 boolean_t dsl_scan_active(dsl_scan_t *scn);
 boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
+void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
+void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);

 #ifdef	__cplusplus
 }
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@ -859,17 +859,19 @@ typedef struct pool_scan_stat {
 	uint64_t	pss_start_time;	/* scan start time */
 	uint64_t	pss_end_time;	/* scan end time */
 	uint64_t	pss_to_examine;	/* total bytes to scan */
-	uint64_t	pss_examined;	/* total examined bytes	*/
+	uint64_t	pss_examined;	/* total bytes located by scanner */
 	uint64_t	pss_to_process; /* total bytes to process */
 	uint64_t	pss_processed;	/* total processed bytes */
 	uint64_t	pss_errors;	/* scan errors	*/

 	/* values not stored on disk */
 	uint64_t	pss_pass_exam; /* examined bytes per scan pass */
+	uint64_t	pss_pass_issued; /* issued bytes per scan pass */
 	uint64_t	pss_pass_start;	/* start time of a scan pass */
 	uint64_t	pss_pass_scrub_pause; /* pause time of a scurb pass */
 	/* cumulative time scrub spent paused, needed for rate calculation */
 	uint64_t	pss_pass_scrub_spent_paused;
+	uint64_t	pss_issued;	/* total bytes checked by scanner */
 } pool_scan_stat_t;

 typedef enum dsl_scan_state {
--- a/include/sys/range_tree.h
+++ b/include/sys/range_tree.h
@ -44,8 +44,13 @@ typedef struct range_tree_ops range_tree_ops_t;
 typedef struct range_tree {
 	avl_tree_t	rt_root;	/* offset-ordered segment AVL tree */
 	uint64_t	rt_space;	/* sum of all segments in the map */
+	uint64_t	rt_gap;		/* allowable inter-segment gap */
 	range_tree_ops_t *rt_ops;
+
+	/* rt_avl_compare should only be set if rt_arg is an AVL tree */
 	void		*rt_arg;
+	int (*rt_avl_compare)(const void *, const void *);
+

 	/*
 	 * The rt_histogram maintains a histogram of ranges. Each bucket,
@ -61,6 +66,7 @@ typedef struct range_seg {
 	avl_node_t	rs_pp_node;	/* AVL picker-private node */
 	uint64_t	rs_start;	/* starting offset of this segment */
 	uint64_t	rs_end;		/* ending offset (non-inclusive) */
+	uint64_t	rs_fill;	/* actual fill if gap mode is on */
 } range_seg_t;

 struct range_tree_ops {
@ -75,20 +81,37 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);

 void range_tree_init(void);
 void range_tree_fini(void);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+    int (*avl_compare) (const void *, const void *), kmutex_t *lp,
+    uint64_t gap);
 range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
 void range_tree_destroy(range_tree_t *rt);
 boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+    uint64_t newstart, uint64_t newsize);
 uint64_t range_tree_space(range_tree_t *rt);
 void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
 void range_tree_stat_verify(range_tree_t *rt);
+void range_tree_set_lock(range_tree_t *rt, kmutex_t *lp);

 void range_tree_add(void *arg, uint64_t start, uint64_t size);
 void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
 void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);

 void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
 void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
+range_seg_t *range_tree_first(range_tree_t *rt);
+
+void rt_avl_create(range_tree_t *rt, void *arg);
+void rt_avl_destroy(range_tree_t *rt, void *arg);
+void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_vacate(range_tree_t *rt, void *arg);
+extern struct range_tree_ops rt_avl_ops;

 #ifdef	__cplusplus
 }
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@ -185,9 +185,9 @@ struct spa {
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
 	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
-	uint64_t	spa_last_io;		/* lbolt of last non-scan I/O */
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
-	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
+	uint64_t	spa_scrub_inflight;	/* in-flight scrub bytes */
+	uint64_t	spa_load_verify_ios;	/* in-flight verification IOs */
 	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
 	uint8_t		spa_scrub_active;	/* active or suspended? */
 	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
@ -198,6 +198,7 @@ struct spa {
 	uint64_t	spa_scan_pass_scrub_pause; /* scrub pause time */
 	uint64_t	spa_scan_pass_scrub_spent_paused; /* total paused */
 	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
+	uint64_t	spa_scan_pass_issued;	/* issued bytes per pass */
 	kmutex_t	spa_async_lock;		/* protect async state */
 	kthread_t	*spa_async_thread;	/* thread doing async task */
 	int		spa_async_suspended;	/* async tasks suspended */
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@ -197,6 +197,13 @@ struct vdev {
 	uint64_t	vdev_async_write_queue_depth;
 	uint64_t	vdev_max_async_write_queue_depth;

+	/*
+	 * Protects the vdev_scan_io_queue field itself as well as the
+	 * structure's contents (when present).
+	 */
+	kmutex_t			vdev_scan_io_queue_lock;
+	struct dsl_scan_io_queue	*vdev_scan_io_queue;
+
 	/*
 	 * Leaf vdev state.
 	 */
--- a/lib/libzfs/libzfs_status.c
+++ b/lib/libzfs/libzfs_status.c
@ -214,7 +214,7 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap)
 	 */
 	(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
 	    (uint64_t **)&ps, &psc);
-	if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+	if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER &&
 	    ps->pss_state == DSS_SCANNING)
 		return (ZPOOL_STATUS_RESILVERING);

--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@ -1,5 +1,6 @@
 '\" te
 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
+.\" Copyright (c) 2017 Datto Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License").  You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
@ -626,11 +627,24 @@ Default value: \fB0\fR.
 .sp
 .ne 2
 .na
-\fBzfs_arc_min_prefetch_lifespan\fR (int)
+\fBzfs_arc_min_prefetch_ms\fR (int)
 .ad
 .RS 12n
-Minimum time prefetched blocks are locked in the ARC, specified in jiffies.
-A value of 0 will default to 1 second.
+Minimum time prefetched blocks are locked in the ARC, specified in ms.
+A value of \fB0\fR will default to 1 second.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_min_prescient_prefetch_ms\fR (int)
+.ad
+.RS 12n
+Minimum time "prescient prefetched" blocks are locked in the ARC, specified
+in ms. These blocks are meant to be prefetched fairly aggressively ahead of
+the code that may use them. A value of \fB0\fR will default to 6 seconds.
 .sp
 Default value: \fB0\fR.
 .RE
@ -1657,19 +1671,6 @@ last resort, as it typically results in leaked space, or worse.
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE

-.sp
-.ne 2
-.na
-\fBzfs_resilver_delay\fR (int)
-.ad
-.RS 12n
-Number of ticks to delay prior to issuing a resilver I/O operation when
-a non-resilver or non-scrub I/O operation has occurred within the past
-\fBzfs_scan_idle\fR ticks.
-.sp
-Default value: \fB2\fR.
-.RE
-
 .sp
 .ne 2
 .na
@ -1685,21 +1686,7 @@ Default value: \fB3,000\fR.
 .sp
 .ne 2
 .na
-\fBzfs_scan_idle\fR (int)
-.ad
-.RS 12n
-Idle window in clock ticks.  During a scrub or a resilver, if
-a non-scrub or non-resilver I/O operation has occurred during this
-window, the next scrub or resilver operation is delayed by, respectively
-\fBzfs_scrub_delay\fR or \fBzfs_resilver_delay\fR ticks.
-.sp
-Default value: \fB50\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBzfs_scan_min_time_ms\fR (int)
+\fBzfs_scrub_min_time_ms\fR (int)
 .ad
 .RS 12n
 Scrubs are processed by the sync thread. While scrubbing it will spend
@ -1711,14 +1698,120 @@ Default value: \fB1,000\fR.
 .sp
 .ne 2
 .na
-\fBzfs_scrub_delay\fR (int)
+\fBzfs_scan_checkpoint_intval\fR (int)
 .ad
 .RS 12n
-Number of ticks to delay prior to issuing a scrub I/O operation when
-a non-scrub or non-resilver I/O operation has occurred within the past
-\fBzfs_scan_idle\fR ticks.
+To preserve progress across reboots the sequential scan algorithm periodically
+needs to stop metadata scanning and issue all the verification I/Os to disk.
+The frequency of this flushing is determined by the
+\fBzfs_scan_checkpoint_intval\fR tunable.
 .sp
-Default value: \fB4\fR.
+Default value: \fB7200\fR seconds (every 2 hours).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_fill_weight\fR (int)
+.ad
+.RS 12n
+This tunable affects how scrub and resilver I/O segments are ordered. A higher
+number indicates that we care more about how filled in a segment is, while a
+lower number indicates we care more about the size of the extent without
+considering the gaps within a segment. This value is only tunable upon module
+insertion. Changing the value afterwards will have no effect on scrub or
+resilver performance.
+.sp
+Default value: \fB3\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_issue_strategy\fR (int)
+.ad
+.RS 12n
+Determines the order that data will be verified while scrubbing or resilvering.
+If set to \fB1\fR, data will be verified as sequentially as possible, given the
+amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This
+may improve scrub performance if the pool's data is very fragmented. If set to
+\fB2\fR, the largest mostly-contiguous chunk of found data will be verified
+first. By deferring scrubbing of small segments, we may later find adjacent data
+to coalesce and increase the segment size. If set to \fB0\fR, zfs will use
+strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a
+checkpoint.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_legacy\fR (int)
+.ad
+.RS 12n
+A value of 0 indicates that scrubs and resilvers will gather metadata in
+memory before issuing sequential I/O. A value of 1 indicates that the legacy
+algorithm will be used where I/O is initiated as soon as it is discovered.
+Changing this value to 0 will not affect scrubs or resilvers that are already
+in progress.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_max_ext_gap\fR (int)
+.ad
+.RS 12n
+Indicates the largest gap in bytes between scrub / resilver I/Os that will still
+be considered sequential for sorting purposes. Changing this value will not
+affect scrubs or resilvers that are already in progress.
+.sp
+Default value: \fB2097152 (2 MB)\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_mem_lim_fact\fR (int)
+.ad
+.RS 12n
+Maximum fraction of RAM used for I/O sorting by sequential scan algorithm.
+This tunable determines the hard limit for I/O sorting memory usage.
+When the hard limit is reached we stop scanning metadata and start issuing
+data verification I/O. This is done until we get below the soft limit.
+.sp
+Default value: \fB20\fR which is 5% of RAM (1/20).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_mem_lim_soft_fact\fR (int)
+.ad
+.RS 12n
+The fraction of the hard limit used to determine the soft limit for I/O sorting
+by the sequential scan algorithm. When we cross this limit from below no action
+is taken. When we cross this limit from above it is because we are issuing
+verification I/O. In this case (unless the metadata scan is done) we stop
+issuing verification I/O and start scanning metadata again until we get to the
+hard limit.
+.sp
+Default value: \fB20\fR which is 5% of the hard limit (1/20).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_vdev_limit\fR (int)
+.ad
+.RS 12n
+Maximum amount of data that can be issued concurrently for scrubs and
+resilvers per leaf device, given in bytes.
+.sp
+Default value: \fB41943040\fR.
 .RE

 .sp
@ -1777,18 +1870,6 @@ value of 75% will create a maximum of one thread per cpu.
 Default value: \fB75\fR.
 .RE

-.sp
-.ne 2
-.na
-\fBzfs_top_maxinflight\fR (int)
-.ad
-.RS 12n
-Max concurrent I/Os per top-level vdev (mirrors or raidz arrays) allowed during
-scrub or resilver operations.
-.sp
-Default value: \fB32\fR.
-.RE
-
 .sp
 .ne 2
 .na
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@ -357,7 +357,8 @@ int			arc_no_grow_shift = 5;
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
-static int		arc_min_prefetch_lifespan;
+static int		arc_min_prefetch_ms;
+static int		arc_min_prescient_prefetch_ms;

 /*
 * If this percent of memory is free, don't throttle.
@ -407,7 +408,8 @@ unsigned long zfs_arc_dnode_limit_percent = 10;
 * These tunables are Linux specific
 */
 unsigned long zfs_arc_sys_free = 0;
-int zfs_arc_min_prefetch_lifespan = 0;
+int zfs_arc_min_prefetch_ms = 0;
+int zfs_arc_min_prescient_prefetch_ms = 0;
 int zfs_arc_p_aggressive_disable = 1;
 int zfs_arc_p_dampener_disable = 1;
 int zfs_arc_meta_prune = 10000;
@ -663,6 +665,7 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_meta_min;
 	kstat_named_t arcstat_sync_wait_for_async;
 	kstat_named_t arcstat_demand_hit_predictive_prefetch;
+	kstat_named_t arcstat_demand_hit_prescient_prefetch;
 	kstat_named_t arcstat_need_free;
 	kstat_named_t arcstat_sys_free;
 	kstat_named_t arcstat_raw_size;
@ -762,6 +765,7 @@ static arc_stats_t arc_stats = {
 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
 	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 },
 	{ "arc_raw_size",		KSTAT_DATA_UINT64 }
@ -861,6 +865,8 @@ static taskq_t *arc_prune_taskq;
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define	HDR_PRESCIENT_PREFETCH(hdr)	\
+	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define	HDR_COMPRESSION_ENABLED(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

@ -3778,6 +3784,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
 	arc_state_t *evicted_state, *state;
 	int64_t bytes_evicted = 0;
+	int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+	    arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;

 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT(HDR_HAS_L1HDR(hdr));
@ -3831,8 +3839,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 	/* prefetch buffers have a minimum lifespan */
 	if (HDR_IO_IN_PROGRESS(hdr) ||
 	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-	    arc_min_prefetch_lifespan)) {
+	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
 		ARCSTAT_BUMP(arcstat_evict_skip);
 		return (bytes_evicted);
 	}
@ -5492,13 +5499,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * - move the buffer to the head of the list if this is
 		 *   another prefetch (to make it less likely to be evicted).
 		 */
-		if (HDR_PREFETCH(hdr)) {
+		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
 				/* link protected by hash lock */
 				ASSERT(multilist_link_active(
 				    &hdr->b_l1hdr.b_arc_node));
 			} else {
-				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREFETCH |
+				    ARC_FLAG_PRESCIENT_PREFETCH);
 				atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
 				ARCSTAT_BUMP(arcstat_mru_hits);
 			}
@ -5532,10 +5541,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * MFU state.
 		 */

-		if (HDR_PREFETCH(hdr)) {
+		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			new_state = arc_mru;
-			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
-				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREFETCH |
+				    ARC_FLAG_PRESCIENT_PREFETCH);
+			}
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
@ -5557,11 +5569,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * If it was a prefetch, we will explicitly move it to
 		 * the head of the list now.
 		 */
-		if ((HDR_PREFETCH(hdr)) != 0) {
-			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-			/* link protected by hash_lock */
-			ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-		}
+
 		atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
 		ARCSTAT_BUMP(arcstat_mfu_hits);
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
@ -5573,12 +5581,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 		 * MFU state.
 		 */

-		if (HDR_PREFETCH(hdr)) {
+		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
 			/*
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
-			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
 			new_state = arc_mru;
 		}

@ -5605,20 +5612,25 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 /* a generic arc_read_done_func_t which you can use */
 /* ARGSUSED */
 void
-arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
-	if (error == 0)
+	/*
+	 * A NULL buf is how the read path reports that no buffer could be
+	 * provided (e.g. arc_read() sets buf to NULL when
+	 * arc_buf_alloc_impl() fails), so there is nothing to copy.
+	 */
+	if (buf == NULL)
+		return;
+
+	/* Copy the data out to arg, then give the arc buf back. */
 	bcopy(buf->b_data, arg, arc_buf_size(buf));
 	arc_buf_destroy(buf, arg);
 }

 /* a generic arc_read_done_func_t */
+/* ARGSUSED */
 void
-arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
 	arc_buf_t **bufp = arg;
-	if (error != 0) {
-		arc_buf_destroy(buf, arg);
+
+	if (buf == NULL) {
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
@ -5652,7 +5664,6 @@ arc_read_done(zio_t *zio)
 	arc_callback_t	*callback_list;
 	arc_callback_t	*acb;
 	boolean_t	freeable = B_FALSE;
-	boolean_t	no_zio_error = (zio->io_error == 0);

 	/*
 	 * The hdr was inserted into hash-table and removed from lists
@ -5699,7 +5710,7 @@ arc_read_done(zio_t *zio)
 		}
 	}

-	if (no_zio_error) {
+	if (zio->io_error == 0) {
 		/* byteswap if necessary */
 		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
 			if (BP_GET_LEVEL(zio->io_bp) > 0) {
@ -5720,7 +5731,8 @@ arc_read_done(zio_t *zio)
 	callback_list = hdr->b_l1hdr.b_acb;
 	ASSERT3P(callback_list, !=, NULL);

-	if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+	if (hash_lock && zio->io_error == 0 &&
+	    hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * Only call arc_access on anonymous buffers.  This is because
 		 * if we've issued an I/O for an evicted buffer, we've already
@ -5741,13 +5753,19 @@ arc_read_done(zio_t *zio)
 		if (!acb->acb_done)
 			continue;

-		/* This is a demand read since prefetches don't use callbacks */
 		callback_cnt++;

+		if (zio->io_error != 0)
+			continue;
+
 		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
 		    acb->acb_dsobj, acb->acb_private, acb->acb_encrypted,
-		    acb->acb_compressed, acb->acb_noauth, no_zio_error,
+		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
 		    &acb->acb_buf);
+		if (error != 0) {
+			arc_buf_destroy(acb->acb_buf, acb->acb_private);
+			acb->acb_buf = NULL;
+		}

 		/*
 		 * Assert non-speculative zios didn't fail because an
@ -5770,10 +5788,9 @@ arc_read_done(zio_t *zio)
 			}
 		}

-		if (no_zio_error) {
+		if (zio->io_error == 0)
 			zio->io_error = error;
 	}
-	}
 	hdr->b_l1hdr.b_acb = NULL;
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	if (callback_cnt == 0)
@ -5782,7 +5799,7 @@ arc_read_done(zio_t *zio)
 	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
 	    callback_list != NULL);

-	if (no_zio_error) {
+	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 	} else {
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@ -5816,8 +5833,8 @@ arc_read_done(zio_t *zio)
 	/* execute each callback and free its structure */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done) {
-			acb->acb_done(zio, zio->io_error, acb->acb_buf,
-			    acb->acb_private);
+			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+			    acb->acb_buf, acb->acb_private);
 		}

 		if (acb->acb_zio_dummy != NULL) {
@ -5974,12 +5991,25 @@ top:
 				arc_hdr_clear_flags(hdr,
 				    ARC_FLAG_PREDICTIVE_PREFETCH);
 			}
+
+			if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+				ARCSTAT_BUMP(
+				    arcstat_demand_hit_prescient_prefetch);
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PRESCIENT_PREFETCH);
+			}
+
 			ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));

 			/* Get a buf with the desired data in it. */
 			rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset,
 			    private, encrypted_read, compressed_read,
 			    noauth_read, B_TRUE, &buf);
+			if (rc != 0) {
+				arc_buf_destroy(buf, private);
+				buf = NULL;
+			}
+
 			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0);
 		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
 		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
@ -5987,6 +6017,8 @@ top:
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, hash_lock);
+		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		mutex_exit(hash_lock);
@ -5996,7 +6028,7 @@ top:
 		    data, metadata, hits);

 		if (done)
-			done(NULL, rc, buf, private);
+			done(NULL, zb, bp, buf, private);
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
@ -6112,6 +6144,8 @@ top:
 		if (*arc_flags & ARC_FLAG_PREFETCH &&
 		    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		if (BP_IS_AUTHENTICATED(bp))
@ -7223,9 +7257,15 @@ arc_tuning_update(void)
 	if (zfs_arc_p_min_shift)
 		arc_p_min_shift = zfs_arc_p_min_shift;

-	/* Valid range: 1 - N ticks */
-	if (zfs_arc_min_prefetch_lifespan)
-		arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
+	/* Valid range: 1 - N ms */
+	if (zfs_arc_min_prefetch_ms)
+		arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+	/* Valid range: 1 - N ms */
+	if (zfs_arc_min_prescient_prefetch_ms) {
+		arc_min_prescient_prefetch_ms =
+		    zfs_arc_min_prescient_prefetch_ms;
+	}

 	/* Valid range: 0 - 100 */
 	if ((zfs_arc_lotsfree_percent >= 0) &&
@ -7368,7 +7408,8 @@ arc_init(void)
 	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);

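-	/* Convert seconds to clock ticks */
+	/*
+	 * Default minimum prefetch lifetimes. No unit conversion happens
+	 * here; arc_evict_hdr() multiplies the selected value by hz.
+	 * NOTE(review): that makes these behave as seconds despite the
+	 * _ms names -- confirm the intended unit.
+	 */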
-	arc_min_prefetch_lifespan = 1 * hz;
+	arc_min_prefetch_ms = 1;
+	arc_min_prescient_prefetch_ms = 6;

 #ifdef _KERNEL
 	/*
@ -9006,8 +9047,12 @@ MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
 module_param(zfs_compressed_arc_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers");

-module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
-MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+module_param(zfs_arc_min_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms");
+
+module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms,
+	"Min life of prescient prefetched block in ms");

 module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@ -973,7 +973,8 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
 }

 static void
-dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;

@ -987,19 +988,22 @@ dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
 	ASSERT(db->db.db_data == NULL);
 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
+		if (buf == NULL) {
+			buf = arc_alloc_buf(db->db_objset->os_spa,
+			    db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+		}
 		arc_release(buf, db);
 		bzero(buf->b_data, db->db.db_size);
 		arc_buf_freeze(buf);
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
-	} else if (err == 0) {
+	} else if (buf != NULL) {
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
-		arc_buf_destroy(buf, db);
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
@ -2512,7 +2516,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 * prefetch if the next block down is our target.
 */
 static void
-dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
 	dbuf_prefetch_arg_t *dpa = private;

@ -2551,13 +2556,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
 		dbuf_rele(db, FTAG);
 	}

-	dpa->dpa_curlevel--;
+	if (abuf == NULL) {
+		kmem_free(dpa, sizeof (*dpa));
+		return;
+	}

+	dpa->dpa_curlevel--;
 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
-	if (BP_IS_HOLE(bp) || err != 0) {
+
+	if (BP_IS_HOLE(bp)) {
 		kmem_free(dpa, sizeof (*dpa));
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@ -1172,14 +1172,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 void
 ddt_sync(spa_t *spa, uint64_t txg)
 {
+	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 	dmu_tx_t *tx;
-	zio_t *rio = zio_root(spa, NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	zio_t *rio;

 	ASSERT(spa_syncing_txg(spa) == txg);

 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

+	rio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+	/*
+	 * This function may cause an immediate scan of ddt blocks (see
+	 * the comment above dsl_scan_ddt() for details). We set the
+	 * scan's root zio here so that we can wait for any scan IOs in
+	 * addition to the regular ddt IOs.
+	 */
+	ASSERT3P(scn->scn_zio_root, ==, NULL);
+	scn->scn_zio_root = rio;
+
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (ddt == NULL)
@ -1189,6 +1201,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
 	}

 	(void) zio_wait(rio);
+	scn->scn_zio_root = NULL;

 	dmu_tx_commit(tx);
 }
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@ -520,7 +520,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 {
 	prefetch_data_t *pfd = arg;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
-	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+	    ARC_FLAG_PRESCIENT_PREFETCH;

 	ASSERT(pfd->pd_bytes_fetched >= 0);
 	if (bp == NULL)
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@ -390,8 +390,10 @@ dsl_pool_close(dsl_pool_t *dp)
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
 	taskq_destroy(dp->dp_iput_taskq);
-	if (dp->dp_blkstats)
+	if (dp->dp_blkstats) {
+		mutex_destroy(&dp->dp_blkstats->zab_lock);
 		vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+	}
 	kmem_free(dp, sizeof (dsl_pool_t));
 }

--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@ -971,85 +971,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
 	return (AVL_CMP(r1->rs_start, r2->rs_start));
 }

-/*
- * Create any block allocator specific components. The current allocators
- * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
- */
-static void
-metaslab_rt_create(range_tree_t *rt, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT(msp->ms_tree == NULL);
-
-	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
-	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-/*
- * Destroy the block allocator specific components.
- */
-static void
-metaslab_rt_destroy(range_tree_t *rt, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-	ASSERT0(avl_numnodes(&msp->ms_size_tree));
-
-	avl_destroy(&msp->ms_size_tree);
-}
-
-static void
-metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-	VERIFY(!msp->ms_condensing);
-	avl_add(&msp->ms_size_tree, rs);
-}
-
-static void
-metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-	VERIFY(!msp->ms_condensing);
-	avl_remove(&msp->ms_size_tree, rs);
-}
-
-static void
-metaslab_rt_vacate(range_tree_t *rt, void *arg)
-{
-	metaslab_t *msp = arg;
-
-	ASSERT3P(rt->rt_arg, ==, msp);
-	ASSERT3P(msp->ms_tree, ==, rt);
-
-	/*
-	 * Normally one would walk the tree freeing nodes along the way.
-	 * Since the nodes are shared with the range trees we can avoid
-	 * walking all nodes and just reinitialize the avl tree. The nodes
-	 * will be freed by the range tree, so we don't want to free them here.
-	 */
-	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
-	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-static range_tree_ops_t metaslab_rt_ops = {
-	metaslab_rt_create,
-	metaslab_rt_destroy,
-	metaslab_rt_add,
-	metaslab_rt_remove,
-	metaslab_rt_vacate
-};
-
 /*
 * ==========================================================================
 * Common allocator routines
@ -1425,7 +1346,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
-	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
+	ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
+	    metaslab_rangesize_compare, &ms->ms_lock, 0);
 	metaslab_group_add(mg, ms);

 	metaslab_set_fragmentation(ms);
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@ -33,8 +33,58 @@
 #include <sys/zio.h>
 #include <sys/range_tree.h>

+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can either:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. The maximum gap to bridge is the 'gap' argument of
+ * range_tree_create_impl(). The default behavior is not to bridge gaps
+ * (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk()
+ * or range_tree_vacate() functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create function. Any callbacks that are non-NULL
+ * are then called at the appropriate times.
+ *
+ * The range tree code also supports a special variant of range trees
+ * that can bridge small gaps between segments. This kind of tree is used
+ * by the dsl scanning code to group I/Os into mostly sequential chunks to
+ * optimize disk performance. The code here attempts to do this with as
+ * little memory and computational overhead as possible. One limitation of
+ * this implementation is that segments of range trees with gaps can only
+ * support removing complete segments.
+ */
+
 kmem_cache_t *range_seg_cache;

+/*
+ * Ready-made callback table for callers that want the segments of a
+ * range tree mirrored in a second AVL tree. The avl_tree_t to maintain
+ * is the range tree's 'arg'.
+ */
+struct range_tree_ops rt_avl_ops = {
+	.rtop_add = rt_avl_add,
+	.rtop_remove = rt_avl_remove,
+	.rtop_vacate = rt_avl_vacate,
+	.rtop_create = rt_avl_create,
+	.rtop_destroy = rt_avl_destroy,
+};
+
 void
 range_tree_init(void)
 {
@ -75,6 +125,18 @@ range_tree_stat_verify(range_tree_t *rt)
 	}
 }

+/*
+ * Replace the mutex recorded in rt->rt_lock with lp. This lets a range
+ * tree be moved from one containing structure to another without being
+ * recreated.
+ *
+ * The caller must hold both the tree's current lock and lp; this is
+ * only checked by the ASSERT below.
+ */
+void
+range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
+{
+	ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
+	rt->rt_lock = lp;
+}
+
 static void
 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
 {
@ -121,31 +183,38 @@ range_tree_seg_compare(const void *x1, const void *x2)
 }

+/*
+ * Allocate and initialize an empty range tree whose lock is lp.
+ *
+ * ops, arg:    optional callback table and the argument handed to each
+ *              callback. If rtop_create is set it is called before
+ *              returning.
+ * avl_compare: saved in rt_avl_compare; rt_avl_create() uses it as the
+ *              comparator of the secondary AVL tree.
+ * gap:         saved in rt_gap; range_tree_add_impl() merges a new
+ *              range with a neighbor that is at most this far away.
+ *              A gap of 0 only merges ranges that touch.
+ */
 range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+    int (*avl_compare) (const void *, const void *), kmutex_t *lp, uint64_t gap)
 {
-	range_tree_t *rt;
-
-	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

 	avl_create(&rt->rt_root, range_tree_seg_compare,
 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

 	rt->rt_lock = lp;
 	rt->rt_ops = ops;
+	rt->rt_gap = gap;
 	rt->rt_arg = arg;
+	rt->rt_avl_compare = avl_compare;

-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
 		rt->rt_ops->rtop_create(rt, rt->rt_arg);

 	return (rt);
 }

+/*
+ * Create a range tree with the original three-argument interface: no
+ * secondary AVL comparator and a gap of 0.
+ */
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+{
+	range_tree_t *rt = range_tree_create_impl(ops, arg, NULL, lp, 0);
+
+	return (rt);
+}
+
 void
 range_tree_destroy(range_tree_t *rt)
 {
 	VERIFY0(rt->rt_space);

-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

 	avl_destroy(&rt->rt_root);
@ -153,40 +222,102 @@ range_tree_destroy(range_tree_t *rt)
 }

 void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+
+	ASSERT3U(rs->rs_fill + delta, !=, 0);
+	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+	rs->rs_fill += delta;
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 {
 	range_tree_t *rt = arg;
 	avl_index_t where;
 	range_seg_t rsearch, *rs_before, *rs_after, *rs;
-	uint64_t end = start + size;
+	uint64_t end = start + size, gap = rt->rt_gap;
+	uint64_t bridge_size = 0;
 	boolean_t merge_before, merge_after;

 	ASSERT(MUTEX_HELD(rt->rt_lock));
-	VERIFY(size != 0);
+	ASSERT3U(size, !=, 0);
+	ASSERT3U(fill, <=, size);

 	rsearch.rs_start = start;
 	rsearch.rs_end = end;
 	rs = avl_find(&rt->rt_root, &rsearch, &where);

-	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
+	if (gap == 0 && rs != NULL &&
+	    rs->rs_start <= start && rs->rs_end >= end) {
 		zfs_panic_recover("zfs: allocating allocated segment"
-		    "(offset=%llu size=%llu)\n",
-		    (longlong_t)start, (longlong_t)size);
+		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
+		    (longlong_t)start, (longlong_t)size,
+		    (longlong_t)rs->rs_start,
+		    (longlong_t)rs->rs_end - rs->rs_start);
 		return;
 	}

-	/* Make sure we don't overlap with either of our neighbors */
-	VERIFY(rs == NULL);
+	/*
+	 * If this is a gap-supporting range tree, it is possible that we
+	 * are inserting into an existing segment. In this case simply
+	 * bump the fill count and call the remove / add callbacks. If the
+	 * new range will extend an existing segment, we remove the
+	 * existing one, apply the new extent to it and re-insert it using
+	 * the normal code paths.
+	 */
+	if (rs != NULL) {
+		ASSERT3U(gap, !=, 0);
+		if (rs->rs_start <= start && rs->rs_end >= end) {
+			range_tree_adjust_fill(rt, rs, fill);
+			return;
+		}

+		avl_remove(&rt->rt_root, rs);
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+		range_tree_stat_decr(rt, rs);
+		rt->rt_space -= rs->rs_end - rs->rs_start;
+
+		fill += rs->rs_fill;
+		start = MIN(start, rs->rs_start);
+		end = MAX(end, rs->rs_end);
+		size = end - start;
+
+		range_tree_add_impl(rt, start, size, fill);
+
+		kmem_cache_free(range_seg_cache, rs);
+		return;
+	}
+
+	ASSERT3P(rs, ==, NULL);
+
+	/*
+	 * Determine whether or not we will have to merge with our neighbors.
+	 * If gap != 0, we might need to merge with our neighbors even if we
+	 * aren't directly touching.
+	 */
 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

-	merge_before = (rs_before != NULL && rs_before->rs_end == start);
-	merge_after = (rs_after != NULL && rs_after->rs_start == end);
+	merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
+	merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
+
+	if (merge_before && gap != 0)
+		bridge_size += start - rs_before->rs_end;
+	if (merge_after && gap != 0)
+		bridge_size += rs_after->rs_start - end;

 	if (merge_before && merge_after) {
 		avl_remove(&rt->rt_root, rs_before);
-		if (rt->rt_ops != NULL) {
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
 		}
@ -194,43 +325,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
 		range_tree_stat_decr(rt, rs_before);
 		range_tree_stat_decr(rt, rs_after);

+		rs_after->rs_fill += rs_before->rs_fill + fill;
 		rs_after->rs_start = rs_before->rs_start;
 		kmem_cache_free(range_seg_cache, rs_before);
 		rs = rs_after;
 	} else if (merge_before) {
-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

 		range_tree_stat_decr(rt, rs_before);

+		rs_before->rs_fill += fill;
 		rs_before->rs_end = end;
 		rs = rs_before;
 	} else if (merge_after) {
-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

 		range_tree_stat_decr(rt, rs_after);

+		rs_after->rs_fill += fill;
 		rs_after->rs_start = start;
 		rs = rs_after;
 	} else {
 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+
+		rs->rs_fill = fill;
 		rs->rs_start = start;
 		rs->rs_end = end;
 		avl_insert(&rt->rt_root, rs, where);
 	}

-	if (rt->rt_ops != NULL)
+	if (gap != 0)
+		ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
+	else
+		ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
+
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

 	range_tree_stat_incr(rt, rs);
-	rt->rt_space += size;
+	rt->rt_space += size + bridge_size;
 }

 void
-range_tree_remove(void *arg, uint64_t start, uint64_t size)
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{
+	range_tree_add_impl(arg, start, size, size);
+}
+
+static void
+range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
+    boolean_t do_fill)
 {
-	range_tree_t *rt = arg;
 	avl_index_t where;
 	range_seg_t rsearch, *rs, *newseg;
 	uint64_t end = start + size;
@ -251,6 +398,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 		    (longlong_t)start, (longlong_t)size);
 		return;
 	}
+
+	/*
+	 * Range trees with gap support must only remove complete segments
+	 * from the tree. This allows us to maintain accurate fill accounting
+	 * and to ensure that bridged sections are not leaked. If we need to
+	 * remove less than the full segment, we can only adjust the fill count.
+	 */
+	if (rt->rt_gap != 0) {
+		if (do_fill) {
+			if (rs->rs_fill == size) {
+				start = rs->rs_start;
+				end = rs->rs_end;
+				size = end - start;
+			} else {
+				range_tree_adjust_fill(rt, rs, -size);
+				return;
+			}
+		} else if (rs->rs_start != start || rs->rs_end != end) {
+			zfs_panic_recover("zfs: freeing partial segment of "
+			    "gap tree (offset=%llu size=%llu) of "
+			    "(offset=%llu size=%llu)",
+			    (longlong_t)start, (longlong_t)size,
+			    (longlong_t)rs->rs_start,
+			    (longlong_t)rs->rs_end - rs->rs_start);
+			return;
+		}
+	}
+
 	VERIFY3U(rs->rs_start, <=, start);
 	VERIFY3U(rs->rs_end, >=, end);

@ -259,19 +434,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)

 	range_tree_stat_decr(rt, rs);

-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

 	if (left_over && right_over) {
 		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
 		newseg->rs_start = end;
 		newseg->rs_end = rs->rs_end;
+		newseg->rs_fill = newseg->rs_end - newseg->rs_start;
 		range_tree_stat_incr(rt, newseg);

 		rs->rs_end = start;

 		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
 	} else if (left_over) {
 		rs->rs_end = start;
@ -284,15 +460,55 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 	}

 	if (rs != NULL) {
+		/*
+		 * The fill of the leftover segment will always be equal to
+		 * the size, since we do not support removing partial segments
+		 * of range trees with gaps.
+		 */
+		rs->rs_fill = rs->rs_end - rs->rs_start;
 		range_tree_stat_incr(rt, rs);

-		if (rt->rt_ops != NULL)
+		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 	}

 	rt->rt_space -= size;
 }

+/*
+ * Remove the range starting at 'start' of length 'size' from the tree
+ * passed as arg. This is the non-fill variant: do_fill is B_FALSE.
+ */
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+	range_tree_t *rt = arg;
+
+	range_tree_remove_impl(rt, start, size, B_FALSE);
+}
+
+/*
+ * Fill-aware removal: calls range_tree_remove_impl() with do_fill set,
+ * which on a tree with a non-zero gap either lowers the segment's fill
+ * by 'size' or, when 'size' equals the fill, removes the whole segment.
+ */
+void
+range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+	boolean_t do_fill = B_TRUE;
+
+	range_tree_remove_impl(rt, start, size, do_fill);
+}
+
+/*
+ * Change segment rs in place so that it starts at newstart and spans
+ * newsize. range_tree_stat_decr()/incr() and the rtop_remove/rtop_add
+ * callbacks see the segment with its old bounds and then its new ones,
+ * and rt_space is adjusted by the change in length. rs_fill is left
+ * untouched. The caller must hold rt_lock.
+ *
+ * NOTE(review): rs is not re-inserted into rt_root here, so presumably
+ * the new bounds must not change its order relative to its neighbors --
+ * confirm against callers.
+ */
+void
+range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+    uint64_t newstart, uint64_t newsize)
+{
+	range_tree_ops_t *ops = rt->rt_ops;
+	uint64_t oldsize = rs->rs_end - rs->rs_start;
+	int64_t delta = newsize - oldsize;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+
+	/* Retire the segment under its old bounds. */
+	range_tree_stat_decr(rt, rs);
+	if (ops != NULL && ops->rtop_remove != NULL)
+		ops->rtop_remove(rt, rs, rt->rt_arg);
+
+	rs->rs_start = newstart;
+	rs->rs_end = newstart + newsize;
+
+	/* Account for it again under the new bounds. */
+	range_tree_stat_incr(rt, rs);
+	if (ops != NULL && ops->rtop_add != NULL)
+		ops->rtop_add(rt, rs, rt->rt_arg);
+
+	rt->rt_space += delta;
+}
+
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
@ -308,7 +524,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 	return (avl_find(&rt->rt_root, &rsearch, &where));
 }

-static range_seg_t *
+range_seg_t *
 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
 {
 	range_seg_t *rs = range_tree_find_impl(rt, start, size);
@ -373,7 +589,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)

 	ASSERT(MUTEX_HELD(rt->rt_lock));

-	if (rt->rt_ops != NULL)
+	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

 	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
@ -397,8 +613,60 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
 		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
 }

+/*
+ * Return the first segment of rt_root as reported by avl_first().
+ * The caller must hold rt_lock.
+ */
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+	range_seg_t *first;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+
+	first = avl_first(&rt->rt_root);
+	return (first);
+}
+
 uint64_t
 range_tree_space(range_tree_t *rt)
 {
 	return (rt->rt_space);
 }
+
+/* Generic range tree functions for maintaining segments in an AVL tree. */
+
+/*
+ * rtop_create callback: initialize the avl_tree_t passed as arg. It is
+ * ordered by the range tree's rt_avl_compare and links segments through
+ * their rs_pp_node field.
+ */
+void
+rt_avl_create(range_tree_t *rt, void *arg)
+{
+	avl_create((avl_tree_t *)arg, rt->rt_avl_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
+}
+
+/*
+ * rtop_destroy callback: tear down the avl_tree_t passed as arg. The
+ * tree is expected to be empty already (asserted below).
+ */
+void
+rt_avl_destroy(range_tree_t *rt, void *arg)
+{
+	avl_tree_t *tree = arg;
+
+	ASSERT0(avl_numnodes(tree));
+	avl_destroy(tree);
+}
+
+/* rtop_add callback: insert segment rs into the avl_tree_t passed as arg. */
+void
+rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	avl_add((avl_tree_t *)arg, rs);
+}
+
+/* rtop_remove callback: take segment rs out of the avl_tree_t passed as arg. */
+void
+rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	avl_remove((avl_tree_t *)arg, rs);
+}
+
+/*
+ * rtop_vacate callback: empty the avl_tree_t passed as arg by simply
+ * re-running rt_avl_create() on it; no node is visited or freed here.
+ */
+void
+rt_avl_vacate(range_tree_t *rt, void *arg)
+{
+	/*
+	 * Normally one would walk the tree freeing nodes along the way.
+	 * Since the nodes are shared with the range trees we can avoid
+	 * walking all nodes and just reinitialize the avl tree. The nodes
+	 * will be freed by the range tree, so we don't want to free them here.
+	 */
+	rt_avl_create(rt, arg);
+}
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@ -1996,7 +1996,7 @@ spa_load_verify_done(zio_t *zio)
 	}

 	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_inflight--;
+	spa->spa_load_verify_ios--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
@ -2030,9 +2030,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	size_t size = BP_GET_PSIZE(bp);

 	mutex_enter(&spa->spa_scrub_lock);
-	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+	while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-	spa->spa_scrub_inflight++;
+	spa->spa_load_verify_ios++;
 	mutex_exit(&spa->spa_scrub_lock);

 	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@ -1892,6 +1892,7 @@ spa_init(int mode)
 	zpool_feature_init();
 	spa_config_load();
 	l2arc_start();
+	scan_init();
 	qat_init();
 }

@ -1915,6 +1916,7 @@ spa_fini(void)
 	unique_fini();
 	refcount_fini();
 	fm_fini();
+	scan_fini();
 	qat_fini();

 	avl_destroy(&spa_namespace_avl);
@ -2016,6 +2018,7 @@ spa_scan_stat_init(spa_t *spa)
 		spa->spa_scan_pass_scrub_pause = 0;
 	spa->spa_scan_pass_scrub_spent_paused = 0;
 	spa->spa_scan_pass_exam = 0;
+	spa->spa_scan_pass_issued = 0;
 	vdev_scan_stat_init(spa->spa_root_vdev);
 }

@ -2033,18 +2036,21 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)

 	/* data stored on disk */
 	ps->pss_func = scn->scn_phys.scn_func;
+	ps->pss_state = scn->scn_phys.scn_state;
 	ps->pss_start_time = scn->scn_phys.scn_start_time;
 	ps->pss_end_time = scn->scn_phys.scn_end_time;
 	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
-	ps->pss_examined = scn->scn_phys.scn_examined;
 	ps->pss_to_process = scn->scn_phys.scn_to_process;
 	ps->pss_processed = scn->scn_phys.scn_processed;
 	ps->pss_errors = scn->scn_phys.scn_errors;
-	ps->pss_state = scn->scn_phys.scn_state;
+	ps->pss_examined = scn->scn_phys.scn_examined;
+	ps->pss_issued =
+	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;

 	/* data not stored on disk */
 	ps->pss_pass_start = spa->spa_scan_pass_start;
 	ps->pss_pass_exam = spa->spa_scan_pass_exam;
+	ps->pss_pass_issued = spa->spa_scan_pass_issued;
 	ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
 	ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;

--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@ -360,6 +360,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);

 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
@ -647,6 +648,18 @@ vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;

+	/*
+	 * Scan queues are normally destroyed at the end of a scan. If the
+	 * queue exists here, that implies the vdev is being removed while
+	 * the scan is still running.
+	 */
+	if (vd->vdev_scan_io_queue != NULL) {
+		mutex_enter(&vd->vdev_scan_io_queue_lock);
+		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
+		vd->vdev_scan_io_queue = NULL;
+		mutex_exit(&vd->vdev_scan_io_queue_lock);
+	}
+
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
@ -723,6 +736,7 @@ vdev_free(vdev_t *vd)
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
+	mutex_destroy(&vd->vdev_scan_io_queue_lock);

 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@ -800,6 +814,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)

 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
+
+	dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }

 static void
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@ -169,7 +169,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
 * we include spans of optional I/Os to aid aggregation at the disk even when
 * they aren't able to help us aggregate at this level.
 */
-int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = 1 << 20;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;

--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@ -1070,7 +1070,7 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
 		}
 		err = zap_add(os, intoobj, za.za_name,
 		    8, 1, &value, tx);
-		if (err)
+		if (err != 0)
 			break;
 	}
 	zap_cursor_fini(&zc);
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@ -39,6 +39,7 @@
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zio.h>
@ -1050,6 +1051,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,

 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
+	dsl_scan_freed(spa, bp);

 	/*
 	 * GANG and DEDUP blocks can induce a read (for the gang block header,
@ -3333,26 +3335,6 @@ zio_vdev_io_start(zio_t *zio)

 	ASSERT3P(zio->io_logical, !=, zio);

-	/*
-	 * We keep track of time-sensitive I/Os so that the scan thread
-	 * can quickly react to certain workloads.  In particular, we care
-	 * about non-scrubbing, top-level reads and writes with the following
-	 * characteristics:
-	 *	- synchronous writes of user data to non-slog devices
-	 *	- any reads of user data
-	 * When these conditions are met, adjust the timestamp of spa_last_io
-	 * which allows the scan thread to adjust its workload accordingly.
-	 */
-	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
-	    vd == vd->vdev_top && !vd->vdev_islog &&
-	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
-	    zio->io_txg != spa_syncing_txg(spa)) {
-		uint64_t old = spa->spa_last_io;
-		uint64_t new = ddi_get_lbolt64();
-		if (old != new)
-			(void) atomic_cas_64(&spa->spa_last_io, old, new);
-	}
-
 	align = 1ULL << vd->vdev_top->vdev_ashift;

 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh
@ -33,7 +33,7 @@
 # 8. Put another device offline and check if the test file checksum is correct.
 #
 # NOTES:
-#	A 25ms delay is added to make sure that the scrub is running while
+#	A 250ms delay is added to make sure that the scrub is running while
 #	the reopen kicks the resilver.
 #

@ -70,7 +70,7 @@ log_must md5sum $TESTFILE > $TESTFILE_MD5

 # 4. Execute scrub.
 # add delay to I/O requests for remaining disk in pool
-log_must zinject -d $DISK2 -D25:1 $TESTPOOL
+log_must zinject -d $DISK2 -D250:1 $TESTPOOL
 log_must zpool scrub $TESTPOOL

 # 5. "Plug back" disk.
@ -81,12 +81,12 @@ log_must check_state $TESTPOOL "$REMOVED_DISK_ID" "online"
 # 7. Check if scrub scan is replaced by resilver.
 # the scrub operation has to be running while reopen is executed
 log_must is_pool_scrubbing $TESTPOOL true
+# remove delay from disk
+log_must zinject -c all
 # the scrub will be replaced by resilver, wait until it ends
 log_must wait_for_resilver_end $TESTPOOL $MAXTIMEOUT
 # check if the scrub scan has been interrupted by resilver
 log_must is_scan_restarted $TESTPOOL
-# remove delay from disk
-log_must zinject -c all

 # 8. Put another device offline and check if the test file checksum is correct.
 log_must zpool offline $TESTPOOL $DISK2
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh
@ -34,7 +34,7 @@
 #    replicas.
 #
 # NOTES:
-#	A 25ms delay is added to make sure that the scrub is running while
+#	A 125ms delay is added to make sure that the scrub is running while
 #	the reopen is invoked.
 #

@ -64,20 +64,19 @@ log_must check_state $TESTPOOL "$REMOVED_DISK_ID" "unavail"
 log_must generate_random_file /$TESTPOOL/data $LARGE_FILE_SIZE
 # 4. Execute scrub.
 # add delay to I/O requests for remaining disk in pool
-log_must zinject -d $DISK2 -D25:1 $TESTPOOL
+log_must zinject -d $DISK2 -D125:1 $TESTPOOL
 log_must zpool scrub $TESTPOOL
 # 5. "Plug back" disk.
 insert_disk $REMOVED_DISK $scsi_host
 # 6. Reopen a pool with an -n flag.
 log_must zpool reopen -n $TESTPOOL
 log_must check_state $TESTPOOL "$REMOVED_DISK_ID" "online"
+# remove delay from disk
+log_must zinject -c all
 # 7. Check if scrub scan is NOT replaced by resilver.
 log_must wait_for_scrub_end $TESTPOOL $MAXTIMEOUT
 log_mustnot is_scan_restarted $TESTPOOL

-# remove delay from disk
-log_must zinject -c all
-
 # 8. Check if trying to put device to offline fails because of no valid
 #    replicas.
 log_mustnot zpool offline $TESTPOOL $DISK2
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh
@ -26,7 +26,9 @@
 #

 . $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg

 verify_runnable "global"

+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
 destroy_mirrors
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh
@ -37,8 +37,8 @@ verify_disk_count "$DISKS" 2

 default_mirror_setup_noexit $DISK1 $DISK2

-mntpnt=$(get_prop mountpoint $TESTPOOL)
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)

-# Create 100MB of data
-log_must file_write -b 1048576 -c 100 -o create -d 0 -f $mntpnt/bigfile
+# Create 256M of data
+log_must file_write -b 1048576 -c 256 -o create -d 0 -f $mntpnt/bigfile
 log_pass
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
@ -30,3 +30,6 @@

 export DISK1=${DISKS%% *}
 export DISK2=$(echo $DISKS | awk '{print $2}')
+
+export ZFS_SCAN_VDEV_LIMIT_SLOW=$((128*1024))
+export ZFS_SCAN_VDEV_LIMIT_DEFAULT=$((4*1024*1024))
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh
@ -46,9 +46,9 @@
 #	6. Verify zpool scrub -s succeed when the system is scrubbing.
 #
 # NOTES:
-#	A 10ms delay is added to the ZIOs in order to ensure that the
-#	scrub does not complete before it has a chance to be cancelled.
-#	This can occur when testing with small pools or very fast hardware.
+#	Artificially limit the scrub speed by setting the zfs_scan_vdev_limit
+#	low and adding a 50ms zio delay in order to ensure that the scrub does
+#	not complete early.
 #

 verify_runnable "global"
@ -56,13 +56,21 @@ verify_runnable "global"
 function cleanup
 {
 	log_must zinject -c all
+	log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
+	log_must rm -f $mntpnt/biggerfile
 }

 log_onexit cleanup

 log_assert "Verify scrub, scrub -p, and scrub -s show the right status."

-log_must zinject -d $DISK1 -D20:1 $TESTPOOL
+# Create 1G of additional data
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+log_must file_write -b 1048576 -c 1024 -o create -d 0 -f $mntpnt/biggerfile
+log_must sync
+
+log_must zinject -d $DISK1 -D50:1 $TESTPOOL
+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW
 log_must zpool scrub $TESTPOOL
 log_must is_pool_scrubbing $TESTPOOL true
 log_must zpool scrub -p $TESTPOOL
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh
@ -43,23 +43,22 @@
 #	2. Kick off a second scrub and verify it fails
 #
 # NOTES:
-#	A 10ms delay is added to the ZIOs in order to ensure that the
-#	scrub does not complete before it has a chance to be restarted.
-#	This can occur when testing with small pools or very fast hardware.
+#	Artificially limit the scrub speed by setting the zfs_scan_vdev_limit
+#	low in order to ensure that the scrub does not complete early.
 #

 verify_runnable "global"

 function cleanup
 {
-	        log_must zinject -c all
+	log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
 }

 log_onexit cleanup

 log_assert "Scrub command fails when there is already a scrub in progress"

-log_must zinject -d $DISK1 -D10:1 $TESTPOOL
+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW
 log_must zpool scrub $TESTPOOL
 log_must is_pool_scrubbing $TESTPOOL true
 log_mustnot zpool scrub $TESTPOOL
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh
@ -42,13 +42,13 @@
 #	3. Verify scrub failed until the resilver completed
 #
 # NOTES:
-#	A 10ms delay is added to 10% of zio's in order to ensure that the
-#	resilver does not complete before the scrub can be issued.  This
-#	can occur when testing with small pools or very fast hardware.
+#	Artificially limit the resilver speed by setting zfs_scan_vdev_limit
+#	low in order to ensure that the resilver does not complete early.
+#

 function cleanup
 {
-	log_must zinject -c all
+	log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
 }

 verify_runnable "global"
@ -62,13 +62,12 @@ log_onexit cleanup

 log_assert "Resilver prevent scrub from starting until the resilver completes"

+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW
 log_must zpool detach $TESTPOOL $DISK2
-log_must zinject -d $DISK1 -D10:1 $TESTPOOL
 log_must zpool attach $TESTPOOL $DISK1 $DISK2
 log_must is_pool_resilvering $TESTPOOL
 log_mustnot zpool scrub $TESTPOOL

-# Allow the resilver to finish, or it will interfere with the next test.
 while ! is_pool_resilvered $TESTPOOL; do
 	sleep 1
 done
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh
@ -63,4 +63,8 @@ log_must zpool scrub $TESTPOOL
 log_must zpool detach $TESTPOOL $DISK1
 log_must zpool attach $TESTPOOL $DISK2 $DISK1

+while ! is_pool_resilvered $TESTPOOL; do
+	sleep 1
+done
+
 log_pass "When scrubbing, detach device should not break system."
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh
@ -49,7 +49,7 @@ verify_runnable "global"
 function cleanup
 {
 	poolexists $TESTPOOL && destroy_pool $TESTPOOL
-	log_must rm -f $DISK1 $DISK2 $DISK3
+	log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4
 }

 #
@ -94,14 +94,16 @@ TESTDIR="$TEST_BASE_DIR/zpool_scrub_offline_device"
 DISK1="$TEST_BASE_DIR/zpool_disk1.dat"
 DISK2="$TEST_BASE_DIR/zpool_disk2.dat"
 DISK3="$TEST_BASE_DIR/zpool_disk3.dat"
+DISK4="$TEST_BASE_DIR/zpool_disk4.dat"

 # 1. Create the pool
 log_must truncate -s $DEVSIZE $DISK1
 log_must truncate -s $DEVSIZE $DISK2
 log_must truncate -s $DEVSIZE $DISK3
+log_must truncate -s $DEVSIZE $DISK4
 poolexists $TESTPOOL && destroy_pool $TESTPOOL
 log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL \
-    raidz1 $DISK1 $DISK2 $DISK3
+    raidz2 $DISK1 $DISK2 $DISK3 $DISK4

 # 2. Offline the first device
 zpool_do_sync 'offline' $TESTPOOL $DISK1
--- a/tests/zfs-tests/tests/functional/events/events_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/events/events_002_pos.ksh
@ -81,6 +81,10 @@ log_must truncate -s 0 $ZED_DEBUG_LOG
 # 4. Generate additional events.
 log_must zpool offline $MPOOL $VDEV1
 log_must zpool online $MPOOL $VDEV1
+while ! is_pool_resilvered $MPOOL; do
+	sleep 1
+done
+
 log_must zpool scrub $MPOOL

 # Wait for the scrub to wrap, or is_healthy will be wrong.
--- a/tests/zfs-tests/tests/functional/events/events_common.kshlib
+++ b/tests/zfs-tests/tests/functional/events/events_common.kshlib
@ -78,7 +78,6 @@ function run_and_verify
 	zedlog=${zedlog:-$ZED_DEBUG_LOG}
 	fullcmd="$1"
 	cmd=$(echo $fullcmd | awk '{print $1}')
-	subcmd=$(echo $fullcmd | awk '{print $2}')

 	# If we aren't running zpool or zfs, something is wrong
 	[[ $cmd == "zpool" || $cmd == "zfs" ]] || \