Illumos 5987 - zfs prefetch code needs work

5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>

References:
  https://www.illumos.org/issues/5987 zfs prefetch code needs work
  illumos/illumos-gate@cf6106c 5987 zfs prefetch code needs work

Porting notes:
- [module/zfs/dbuf.c]
  - 5f6d0b6 Handle block pointers with a corrupt logical size
- [module/zfs/dmu_zfetch.c]
  - c65aa5b Fix gcc missing parenthesis warnings
  - 428870f Update core ZFS code from build 121 to build 141.
  - 79c76d5 Change KM_PUSHPAGE -> KM_SLEEP
  - b8d06fc Switch KM_SLEEP to KM_PUSHPAGE
  - Account for ISO C90 - mixed declarations and code - warnings
  - Module parameters (new/changed):
    - Replaced zfetch_block_cap with zfetch_max_distance
      (Max bytes to prefetch per stream (default 8MB; 8 * 1024 * 1024))
    - Preserved zfs_prefetch_disable as 'int' for consistency with
      existing Linux module options.
- [include/sys/trace_arc.h]
  - Added new tracepoints
    - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async);
    - DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch);
- [man/man5/zfs-module-parameters.5]
  - Updated man page

Ported-by: kernelOfTruth kerneloftruth@gmail.com
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Matthew Ahrens
2015-12-26 22:10:31 +01:00
committed by Brian Behlendorf
parent ab5cbbd107
commit 7f60329a26
10 changed files with 285 additions and 663 deletions
+67 -7
View File
@@ -474,6 +474,8 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_sync_wait_for_async;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
kstat_named_t arcstat_need_free;
kstat_named_t arcstat_sys_free;
} arc_stats_t;
@@ -568,6 +570,8 @@ static arc_stats_t arc_stats = {
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 },
{ "arc_sys_free", KSTAT_DATA_UINT64 }
};
@@ -4244,6 +4248,36 @@ top:
if (HDR_IO_IN_PROGRESS(hdr)) {
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
* This sync read must wait for an
* in-progress async read (e.g. a predictive
* prefetch). Async reads are queued
* separately at the vdev_queue layer, so
* this is a form of priority inversion.
* Ideally, we would "inherit" the demand
* i/o's priority by moving the i/o from
* the async queue to the synchronous queue,
* but there is currently no mechanism to do
* so. Track this so that we can evaluate
* the magnitude of this potential performance
* problem.
*
* Note that if the prefetch i/o is already
* active (has been issued to the device),
* the prefetch improved performance, because
* we issued it sooner than we would have
* without the prefetch.
*/
DTRACE_PROBE1(arc__sync__wait__for__async,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
}
if (*arc_flags & ARC_FLAG_WAIT) {
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
@@ -4252,7 +4286,7 @@ top:
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
if (done) {
arc_callback_t *acb = NULL;
arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP);
@@ -4277,6 +4311,19 @@ top:
hdr->b_l1hdr.b_state == arc_mfu);
if (done) {
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
/*
* This is a demand read which does not have to
* wait for i/o because we did a predictive
* prefetch i/o for it, which has completed.
*/
DTRACE_PROBE1(
arc__demand__hit__predictive__prefetch,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(
arcstat_demand_hit_predictive_prefetch);
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
}
add_reference(hdr, hash_lock, private);
/*
* If this block is already in use, create a new
@@ -4349,12 +4396,16 @@ top:
goto top; /* restart the IO request */
}
/* if this is a prefetch, we don't have a reference */
if (*arc_flags & ARC_FLAG_PREFETCH) {
/*
* If there is a callback, we pass our reference to
* it; otherwise we remove our reference.
*/
if (done == NULL) {
(void) remove_reference(hdr, hash_lock,
private);
hdr->b_flags |= ARC_FLAG_PREFETCH;
}
if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH;
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4377,11 +4428,13 @@ top:
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
/* if this is a prefetch, we don't have a reference */
/*
* If there is a callback, we pass a reference to it.
*/
if (done != NULL)
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH;
else
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4399,6 +4452,8 @@ top:
arc_access(hdr, hash_lock);
}
if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -4438,6 +4493,11 @@ top:
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
if (priority == ZIO_PRIORITY_ASYNC_READ)
hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
else
hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true: