Illumos 5987 - zfs prefetch code needs work

5987 zfs prefetch code needs work Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Approved by: Gordon Ross <gordon.ross@nexenta.com> References: https://www.illumos.org/issues/5987 zfs prefetch code needs work illumos/illumos-gate@cf6106c 5987 zfs prefetch code needs work Porting notes: - [module/zfs/dbuf.c] - 5f6d0b6 Handle block pointers with a corrupt logical size - [module/zfs/dmu_zfetch.c] - c65aa5b Fix gcc missing parenthesis warnings - 428870f Update core ZFS code from build 121 to build 141. - 79c76d5 Change KM_PUSHPAGE -> KM_SLEEP - b8d06fc Switch KM_SLEEP to KM_PUSHPAGE - Account for ISO C90 - mixed declarations and code - warnings - Module parameters (new/changed): - Replaced zfetch_block_cap with zfetch_max_distance (Max bytes to prefetch per stream (default 8MB; 8 * 1024 * 1024)) - Preserved zfs_prefetch_disable as 'int' for consistency with existing Linux module options. - [include/sys/trace_arc.h] - Added new tracepoints - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async); - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch); - [man/man5/zfs-module-parameters.5] - Updated man page Ported-by: kernelOfTruth kerneloftruth@gmail.com Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2026-05-27 04:32:16 +03:00 · 2015-12-26 22:10:31 +01:00
parent ab5cbbd107
commit 7f60329a26
10 changed files with 285 additions and 663 deletions
@@ -474,6 +474,8 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_meta_limit;
 	kstat_named_t arcstat_meta_max;
 	kstat_named_t arcstat_meta_min;
+	kstat_named_t arcstat_sync_wait_for_async;
+	kstat_named_t arcstat_demand_hit_predictive_prefetch;
 	kstat_named_t arcstat_need_free;
 	kstat_named_t arcstat_sys_free;
 } arc_stats_t;
@@ -568,6 +570,8 @@ static arc_stats_t arc_stats = {
 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
+	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
+	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 }
 };
@@ -4244,6 +4248,36 @@ top:

 		if (HDR_IO_IN_PROGRESS(hdr)) {

+			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+			    priority == ZIO_PRIORITY_SYNC_READ) {
+				/*
+				 * This sync read must wait for an
+				 * in-progress async read (e.g. a predictive
+				 * prefetch).  Async reads are queued
+				 * separately at the vdev_queue layer, so
+				 * this is a form of priority inversion.
+				 * Ideally, we would "inherit" the demand
+				 * i/o's priority by moving the i/o from
+				 * the async queue to the synchronous queue,
+				 * but there is currently no mechanism to do
+				 * so.  Track this so that we can evaluate
+				 * the magnitude of this potential performance
+				 * problem.
+				 *
+				 * Note that if the prefetch i/o is already
+				 * active (has been issued to the device),
+				 * the prefetch improved performance, because
+				 * we issued it sooner than we would have
+				 * without the prefetch.
+				 */
+				DTRACE_PROBE1(arc__sync__wait__for__async,
+				    arc_buf_hdr_t *, hdr);
+				ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+			}
+			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+				hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+			}
+
 			if (*arc_flags & ARC_FLAG_WAIT) {
 				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
 				mutex_exit(hash_lock);
@@ -4252,7 +4286,7 @@ top:
 			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);

 			if (done) {
-				arc_callback_t	*acb = NULL;
+				arc_callback_t *acb = NULL;

 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
@@ -4277,6 +4311,19 @@ top:
 		    hdr->b_l1hdr.b_state == arc_mfu);

 		if (done) {
+			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+				/*
+				 * This is a demand read which does not have to
+				 * wait for i/o because we did a predictive
+				 * prefetch i/o for it, which has completed.
+				 */
+				DTRACE_PROBE1(
+				    arc__demand__hit__predictive__prefetch,
+				    arc_buf_hdr_t *, hdr);
+				ARCSTAT_BUMP(
+				    arcstat_demand_hit_predictive_prefetch);
+				hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+			}
 			add_reference(hdr, hash_lock, private);
 			/*
 			 * If this block is already in use, create a new
@@ -4349,12 +4396,16 @@ top:
 				goto top; /* restart the IO request */
 			}

-			/* if this is a prefetch, we don't have a reference */
-			if (*arc_flags & ARC_FLAG_PREFETCH) {
+			/*
+			 * If there is a callback, we pass our reference to
+			 * it; otherwise we remove our reference.
+			 */
+			if (done == NULL) {
 				(void) remove_reference(hdr, hash_lock,
 				    private);
-				hdr->b_flags |= ARC_FLAG_PREFETCH;
 			}
+			if (*arc_flags & ARC_FLAG_PREFETCH)
+				hdr->b_flags |= ARC_FLAG_PREFETCH;
 			if (*arc_flags & ARC_FLAG_L2CACHE)
 				hdr->b_flags |= ARC_FLAG_L2CACHE;
 			if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4377,11 +4428,13 @@ top:
 			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);

-			/* if this is a prefetch, we don't have a reference */
+			/*
+			 * If there is a callback, we pass a reference to it.
+			 */
+			if (done != NULL)
+				add_reference(hdr, hash_lock, private);
 			if (*arc_flags & ARC_FLAG_PREFETCH)
 				hdr->b_flags |= ARC_FLAG_PREFETCH;
-			else
-				add_reference(hdr, hash_lock, private);
 			if (*arc_flags & ARC_FLAG_L2CACHE)
 				hdr->b_flags |= ARC_FLAG_L2CACHE;
 			if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4399,6 +4452,8 @@ top:
 			arc_access(hdr, hash_lock);
 		}

+		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+			hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));

 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -4438,6 +4493,11 @@ top:
 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, misses);

+		if (priority == ZIO_PRIORITY_ASYNC_READ)
+			hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
+		else
+			hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
+
 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true: