From c935fe2e9267b2ea6e3187e205aac019f5e0c83f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 22 Dec 2022 15:10:24 -0500 Subject: [PATCH] arc_read()/arc_access() refactoring and cleanup ARC code was many times significantly modified over the years, that created significant amount of tangled and potentially broken code. This should make arc_access()/arc_read() code some more readable. - Decouple prefetch status tracking from b_refcnt. It made sense originally, but became highly cryptic over the years. Move all the logic into arc_access(). While there, clean up and comment state transitions in arc_access(). Some transitions were weird IMO. - Unify arc_access() calls to arc_read() instead of sometimes calling it from arc_read_done(). To avoid extra state changes and checks add one more b_refcnt for ARC_FLAG_IO_IN_PROGRESS. - Reimplement ARC_FLAG_WAIT in case of ARC_FLAG_IO_IN_PROGRESS with the same callback mechanism to not falsely account them as hits. Count those as "iohits", an intermediate between "hits" and "misses". While there, call read callbacks in original request order, that should be good for fairness and random speculations/allocations/aggregations. - Introduce additional statistic counters for prefetch, accounting predictive vs prescient and hits vs iohits vs misses. - Remove hash_lock argument from functions not needing it. - Remove ARC_FLAG_PREDICTIVE_PREFETCH, since it should be opposite to ARC_FLAG_PRESCIENT_PREFETCH if ARC_FLAG_PREFETCH is set. We may wish to add ARC_FLAG_PRESCIENT_PREFETCH to few more places. - Fix few false positive tests found in the process. Reviewed-by: George Wilson Reviewed-by: Brian Behlendorf Reviewed-by: Richard Yao Reviewed-by: Ryan Moeller Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14123 --- include/os/linux/zfs/sys/trace_arc.h | 4 +- include/sys/arc.h | 1 - include/sys/arc_impl.h | 36 ++ module/zfs/arc.c | 550 +++++++++--------- module/zfs/dmu_traverse.c | 3 +- module/zfs/dmu_zfetch.c | 6 +- .../zfs_written_property_001_pos.ksh | 1 + .../functional/l2arc/l2arc_mfuonly_pos.ksh | 2 +- .../tests/functional/trim/autotrim_config.ksh | 1 + .../tests/functional/trim/trim_config.ksh | 1 + .../zvol/zvol_misc/zvol_misc_trim.ksh | 1 + 11 files changed, 312 insertions(+), 294 deletions(-) diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h index d8e733376..a1595c765 100644 --- a/include/os/linux/zfs/sys/trace_arc.h +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -103,12 +103,12 @@ DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ TP_PROTO(arc_buf_hdr_t *ab), \ TP_ARGS(ab)) DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__iohit); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync); -DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); @@ -387,12 +387,12 @@ DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction); #else DEFINE_DTRACE_PROBE1(arc__hit); +DEFINE_DTRACE_PROBE1(arc__iohit); DEFINE_DTRACE_PROBE1(arc__evict); DEFINE_DTRACE_PROBE1(arc__delete); DEFINE_DTRACE_PROBE1(new_state__mru); DEFINE_DTRACE_PROBE1(new_state__mfu); DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync); -DEFINE_DTRACE_PROBE1(arc__demand__hit__predictive__prefetch); DEFINE_DTRACE_PROBE1(l2arc__hit); DEFINE_DTRACE_PROBE1(l2arc__miss); DEFINE_DTRACE_PROBE2(l2arc__read); diff --git a/include/sys/arc.h b/include/sys/arc.h index 532a2fe4b..b2f8f20e1 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -115,7 +115,6 @@ typedef enum arc_flags ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 03eebafa9..9c77284ff 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -101,9 +101,14 @@ struct arc_callback { boolean_t acb_compressed; boolean_t acb_noauth; boolean_t acb_nobuf; + boolean_t acb_wait; + int acb_wait_error; + kmutex_t acb_wait_lock; + kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; + arc_callback_t *acb_prev; arc_callback_t *acb_next; }; @@ -511,15 +516,27 @@ struct arc_buf_hdr { }; typedef struct arc_stats { + /* Number of requests that were satisfied without I/O. */ kstat_named_t arcstat_hits; + /* Number of requests for which I/O was already running. */ + kstat_named_t arcstat_iohits; + /* Number of requests for which I/O has to be issued. */ kstat_named_t arcstat_misses; + /* Same three, but specifically for demand data. */ kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; + /* Same three, but specifically for demand metadata. */ kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; + /* Same three, but specifically for prefetch data. */ kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; + /* Same three, but specifically for prefetch metadata. */ kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; @@ -844,8 +861,18 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; + /* Number of predictive prefetch requests. */ + kstat_named_t arcstat_predictive_prefetch; + /* Number of requests for which predictive prefetch has completed. */ kstat_named_t arcstat_demand_hit_predictive_prefetch; + /* Number of requests for which predictive prefetch was running. */ + kstat_named_t arcstat_demand_iohit_predictive_prefetch; + /* Number of prescient prefetch requests. */ + kstat_named_t arcstat_prescient_prefetch; + /* Number of requests for which prescient prefetch has completed. */ kstat_named_t arcstat_demand_hit_prescient_prefetch; + /* Number of requests for which prescient prefetch was running. */ + kstat_named_t arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; @@ -855,14 +882,19 @@ typedef struct arc_stats { typedef struct arc_sums { wmsum_t arcstat_hits; + wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; + wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; + wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; + wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; + wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; @@ -936,8 +968,12 @@ typedef struct arc_sums { wmsum_t arcstat_prune; aggsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; + wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; + wmsum_t arcstat_demand_iohit_predictive_prefetch; + wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; + wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f51f427c1..1521caa6e 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -483,14 +483,19 @@ arc_state_t ARC_l2c_only; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, @@ -601,8 +606,12 @@ arc_stats_t arc_stats = { { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, + { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, @@ -857,7 +866,7 @@ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); -static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); @@ -1138,9 +1147,8 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&hdr->b_l1hdr.b_arc_node); - list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); @@ -2283,31 +2291,20 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { - arc_state_t *state; + arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } - state = hdr->b_l1hdr.b_state; - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { + state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - multilist_remove(&state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); + arc_evictable_space_decrement(hdr, state); } } @@ -2317,13 +2314,13 @@ add_reference(arc_buf_hdr_t *hdr, const void *tag) * list making it eligible for eviction. */ static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, const void *tag) +remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* @@ -2333,7 +2330,6 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, const void *tag) if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); @@ -2394,8 +2390,7 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) * for the buffer must be held by the caller. */ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; @@ -2416,6 +2411,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + + IMPLY(GHOST_STATE(old_state), bufcnt == 0); + IMPLY(GHOST_STATE(new_state), bufcnt == 0); + IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(old_state == arc_anon, bufcnt <= 1); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2423,11 +2424,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, update_old = B_FALSE; } update_new = update_old; + if (GHOST_STATE(old_state)) + update_old = B_TRUE; + if (GHOST_STATE(new_state)) + update_new = B_TRUE; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); - ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); - ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -2437,12 +2440,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_old = B_TRUE; - } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { @@ -2454,12 +2451,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_new = B_TRUE; - } arc_evictable_space_increment(hdr, new_state); } } @@ -3853,7 +3844,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); + VERIFY0(remove_reference(hdr, tag)); arc_hdr_destroy(hdr); return; } @@ -3867,7 +3858,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); - (void) remove_reference(hdr, hash_lock, tag); + (void) remove_reference(hdr, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } @@ -3894,20 +3885,20 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) * only the evicted headers size. */ static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) +arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* @@ -3934,7 +3925,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, hdr, hash_lock); + arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. @@ -3943,7 +3934,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } @@ -3954,10 +3945,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - MSEC_TO_TICK(min_lifetime))) { + MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } @@ -4022,7 +4012,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); - arc_change_state(evicted_state, hdr, hash_lock); + arc_change_state(evicted_state, hdr); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); @@ -4110,8 +4100,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, if (mutex_tryenter(hash_lock)) { uint64_t revicted; - uint64_t evicted = arc_evict_hdr(hdr, hash_lock, - &revicted); + uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; @@ -5444,150 +5433,137 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) /* * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + /* + * Update buffer prefetch status. + */ + boolean_t was_prefetch = HDR_PREFETCH(hdr); + boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; + if (was_prefetch != now_prefetch) { + if (was_prefetch) { + ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, + HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, + prefetch); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); + if (was_prefetch) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } + if (now_prefetch) { + if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + ARCSTAT_BUMP(arcstat_prescient_prefetch); + } else { + ARCSTAT_BUMP(arcstat_predictive_prefetch); + } + } + if (arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + + clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ - ASSERT0(hdr->b_l1hdr.b_arc_access); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mru, hdr, hash_lock); + arc_change_state(arc_mru, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { - now = ddi_get_lbolt(); + /* + * This buffer has been accessed once recently and either + * its read is still in progress or it is in the cache. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_arc_access = now; + return; + } + hdr->b_l1hdr.b_mru_hits++; + ARCSTAT_BUMP(arcstat_mru_hits); /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). - * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). + * If the previous access was a prefetch, then it already + * handled possible promotion, so nothing more to do for now. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - /* link protected by hash lock */ - ASSERT(multilist_link_active( - &hdr->b_l1hdr.b_arc_node)); - } else { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } + if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. + * If more than ARC_MINTIME have passed from the previous + * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. - */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + arc_change_state(arc_mfu, hdr); } - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. + * This buffer has been accessed once recently, but was + * evicted from the cache. Would we have bigger MRU, it + * would be an MRU hit, so handle it the same way, except + * we don't need to check the previous access time. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { + hdr->b_l1hdr.b_mru_ghost_hits++; + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + if (was_prefetch) { new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, hdr, hash_lock); - - hdr->b_l1hdr.b_mru_ghost_hits++; - ARCSTAT_BUMP(arcstat_mru_ghost_hits); + arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. - * If it was a prefetch, we will explicitly move it to - * the head of the list now. + * This buffer has been accessed more than once and either + * still in the cache or being restored from one of ghosts. */ - - hdr->b_l1hdr.b_mfu_hits++; - ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; - /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - /* - * This is a prefetch access... - * move this block back to the MRU state. - */ - new_state = arc_mru; + if (!HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_mfu_hits++; + ARCSTAT_BUMP(arcstat_mfu_hits); } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(new_state, hdr, hash_lock); - + hdr->b_l1hdr.b_arc_access = now; + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { + /* + * This buffer has been accessed more than once recently, but + * has been evicted from the cache. Would we have bigger MFU + * it would stay in cache, so move it back to MFU state. + */ hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* - * This buffer is on the 2nd Level ARC. + * This buffer is on the 2nd Level ARC and was not accessed + * for a long time, so treat it as new and put into MRU. */ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); @@ -5630,12 +5606,12 @@ arc_buf_access(arc_buf_t *buf) hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); + ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, + !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ @@ -5768,17 +5744,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } + hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we @@ -5788,6 +5754,10 @@ arc_read_done(zio_t *zio) */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + + /* We need the last one to call below in original order. */ + callback_list = acb; + if (!acb->acb_done || acb->acb_nobuf) continue; @@ -5851,20 +5821,15 @@ arc_read_done(zio_t *zio) */ ASSERT(callback_cnt < 2 || hash_lock != NULL); - hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) - ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || - callback_list != NULL); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); @@ -5912,8 +5877,17 @@ arc_read_done(zio_t *zio) zio_nowait(acb->acb_zio_dummy); } - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); + callback_list = acb->acb_prev; + if (acb->acb_wait) { + mutex_enter(&acb->acb_wait_lock); + acb->acb_wait_error = zio->io_error; + acb->acb_wait = B_FALSE; + cv_signal(&acb->acb_wait_cv); + mutex_exit(&acb->acb_wait_lock); + /* acb will be freed by the waiting thread. */ + } else { + kmem_free(acb, sizeof (arc_callback_t)); + } } if (freeable) @@ -6003,12 +5977,10 @@ top: */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { + boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); @@ -6016,6 +5988,7 @@ top: goto out; } + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { @@ -6029,38 +6002,28 @@ top: arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } + + DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. - * Synchronous reads use the b_cv whereas nowait reads - * register a callback. Both are signalled/called in - * arc_read_done. + * Synchronous reads use the acb_wait_cv whereas nowait + * reads register a callback. Both are signalled/called + * in arc_read_done. * - * Errors of the physical I/O may need to be propagated - * to the pio. For synchronous reads, we simply restart - * this function and it will reassess. Nowait reads + * Errors of the physical I/O may need to be propagated. + * Synchronous read errors are returned here from + * arc_read_done via acb_wait_error. Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ - - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; - + arc_callback_t *acb = NULL; + if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; @@ -6069,46 +6032,52 @@ top: acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; + if (*arc_flags & ARC_FLAG_WAIT) { + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, + CV_DEFAULT, NULL); + } acb->acb_zb = *zb; - if (pio != NULL) + if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); + } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; + if (hdr->b_l1hdr.b_acb) + hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); + + ARCSTAT_BUMP(arcstat_iohits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, iohits); + + if (*arc_flags & ARC_FLAG_WAIT) { + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + rc = acb->acb_wait_error; + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); + } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_TRUE); + if (done && !no_buf) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. - */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } - ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ @@ -6130,8 +6099,7 @@ top: } } if (rc != 0) { - (void) remove_reference(hdr, hash_lock, - private); + (void) remove_reference(hdr, private); arc_buf_destroy_impl(buf); buf = NULL; } @@ -6139,25 +6107,12 @@ top: /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, hits); + *arc_flags |= ARC_FLAG_CACHED; if (done) done(NULL, zb, bp, buf, private); @@ -6201,7 +6156,6 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6236,21 +6190,23 @@ top: mutex_exit(hash_lock); goto top; } - - /* - * This is a delicate dance that we play here. - * This hdr might be in the ghost list so we access - * it to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); } + /* + * Call arc_adapt() explicitly before arc_access() to allow + * its logic to balance MRU/MFU based on the original state. + */ + arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); + /* + * Take additional reference for IO_IN_PROGRESS. It stops + * arc_access() from putting this header without any buffers + * and so other references but obviously nonevictable onto + * the evictable list of MRU or MFU state. + */ + add_reference(hdr, hdr); + if (!embedded_bp) + arc_access(hdr, *arc_flags, B_FALSE); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); @@ -6277,24 +6233,10 @@ top: zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } - if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -6307,7 +6249,6 @@ top: ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { @@ -6348,7 +6289,7 @@ top: blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); @@ -6370,7 +6311,8 @@ top: */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + !(l2arc_noprefetch && + (*arc_flags & ARC_FLAG_PREFETCH))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; @@ -6559,10 +6501,8 @@ arc_freed(spa_t *spa, const blkptr_t *bp) /* * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is + * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, + * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU @@ -6579,9 +6519,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); + if (!HDR_HAS_L1HDR(hdr) || + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { @@ -6624,7 +6564,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; @@ -6684,7 +6624,7 @@ arc_release(arc_buf_t *buf, const void *tag) VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); + (void) remove_reference(hdr, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); @@ -6799,7 +6739,7 @@ arc_release(arc_buf_t *buf, const void *tag) hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); @@ -6873,10 +6813,12 @@ arc_write_ready(zio_t *zio) callback->awcb_ready(zio, buf, callback->awcb_private); - if (HDR_IO_IN_PROGRESS(hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ + } if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); @@ -7063,7 +7005,7 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); + arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); @@ -7083,12 +7025,14 @@ arc_write_done(zio_t *zio) } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ } ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -7303,22 +7247,32 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); + as->arcstat_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); + as->arcstat_demand_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); + as->arcstat_demand_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); + as->arcstat_prefetch_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); + as->arcstat_prefetch_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = @@ -7501,10 +7455,18 @@ arc_kstat_update(kstat_t *ksp, int rw) aggsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); + as->arcstat_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); + as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + as->arcstat_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); + as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = @@ -7736,14 +7698,19 @@ arc_state_init(void) zfs_refcount_create(&arc_l2c_only->arcs_size); wmsum_init(&arc_sums.arcstat_hits, 0); + wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); @@ -7817,8 +7784,12 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_prune, 0); aggsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); + wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); @@ -7866,14 +7837,19 @@ arc_state_fini(void) multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_sums.arcstat_hits); + wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); + wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); + wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); @@ -7947,8 +7923,12 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_prune); aggsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); + wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); @@ -9259,7 +9239,7 @@ retry: * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 377634c72..244b9b4cb 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -185,7 +185,8 @@ static boolean_t traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 1d63d7de6..76b8b5608 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -517,13 +517,11 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = 0; for (int64_t blk = pf_start; blk < pf_end; blk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } if (!have_lock) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh index f53a4ac71..fd3194fe9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh @@ -163,6 +163,7 @@ before_clone=$(get_prop written $TESTPOOL/$TESTFS1) log_must zfs clone $TESTPOOL/$TESTFS1@snap1 $TESTPOOL/$TESTFS1/snap1.clone log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS1/snap1.clone/testfile bs=1M \ count=40 +sync_pool after_clone=$(get_prop written $TESTPOOL/$TESTFS1) within_percent $before_clone $after_clone 99.5 || \ log_fail "unexpected written for clone $before_clone $after_clone" diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh index f2bada0eb..89ab94033 100755 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh @@ -80,7 +80,7 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio log_must zpool export $TESTPOOL -log_must zpool import -d $VDIR $TESTPOOL +log_must zpool import -N -d $VDIR $TESTPOOL # Regardless of l2arc_noprefetch, some MFU buffers might be evicted # from ARC, accessed later on as prefetches and transition to MRU as diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index 924b56935..c97772585 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -95,6 +95,7 @@ for type in "" "mirror" "raidz2" "draid"; do # Fill the pool, verify the vdevs are no longer sparse. file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R + sync_pool $TESTPOOL verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS # Remove the file, wait for trim, verify the vdevs are now sparse. diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh index 9a6e19e1c..6a187a05b 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_config.ksh @@ -94,6 +94,7 @@ for type in "" "mirror" "raidz2" "draid"; do # Fill the pool, verify the vdevs are no longer sparse. file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R + sync_pool $TESTPOOL verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS # Remove the file, issue trim, verify the vdevs are now sparse. diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index b95eca73b..2c4ef28ab 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -83,6 +83,7 @@ function do_test { # Write to zvol log_must dd if=$datafile1 of=$zvolpath conv=fsync + sync_pool # Record how much space we've used (should be 5MB, with 128k # of tolerance).