diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h index d8e733376..a1595c765 100644 --- a/include/os/linux/zfs/sys/trace_arc.h +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -103,12 +103,12 @@ DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ TP_PROTO(arc_buf_hdr_t *ab), \ TP_ARGS(ab)) DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__iohit); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync); -DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); @@ -387,12 +387,12 @@ DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction); #else DEFINE_DTRACE_PROBE1(arc__hit); +DEFINE_DTRACE_PROBE1(arc__iohit); DEFINE_DTRACE_PROBE1(arc__evict); DEFINE_DTRACE_PROBE1(arc__delete); DEFINE_DTRACE_PROBE1(new_state__mru); DEFINE_DTRACE_PROBE1(new_state__mfu); DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync); -DEFINE_DTRACE_PROBE1(arc__demand__hit__predictive__prefetch); DEFINE_DTRACE_PROBE1(l2arc__hit); DEFINE_DTRACE_PROBE1(l2arc__miss); DEFINE_DTRACE_PROBE2(l2arc__read); diff --git a/include/sys/arc.h b/include/sys/arc.h index 532a2fe4b..b2f8f20e1 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -115,7 +115,6 @@ typedef enum arc_flags ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 03eebafa9..9c77284ff 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -101,9 +101,14 @@ struct arc_callback { boolean_t acb_compressed; boolean_t 
acb_noauth; boolean_t acb_nobuf; + boolean_t acb_wait; + int acb_wait_error; + kmutex_t acb_wait_lock; + kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; + arc_callback_t *acb_prev; arc_callback_t *acb_next; }; @@ -511,15 +516,27 @@ struct arc_buf_hdr { }; typedef struct arc_stats { + /* Number of requests that were satisfied without I/O. */ kstat_named_t arcstat_hits; + /* Number of requests for which I/O was already running. */ + kstat_named_t arcstat_iohits; + /* Number of requests for which I/O has to be issued. */ kstat_named_t arcstat_misses; + /* Same three, but specifically for demand data. */ kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; + /* Same three, but specifically for demand metadata. */ kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; + /* Same three, but specifically for prefetch data. */ kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; + /* Same three, but specifically for prefetch metadata. */ kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; @@ -844,8 +861,18 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; + /* Number of predictive prefetch requests. */ + kstat_named_t arcstat_predictive_prefetch; + /* Number of requests for which predictive prefetch has completed. */ kstat_named_t arcstat_demand_hit_predictive_prefetch; + /* Number of requests for which predictive prefetch was running. */ + kstat_named_t arcstat_demand_iohit_predictive_prefetch; + /* Number of prescient prefetch requests. 
*/ + kstat_named_t arcstat_prescient_prefetch; + /* Number of requests for which prescient prefetch has completed. */ kstat_named_t arcstat_demand_hit_prescient_prefetch; + /* Number of requests for which prescient prefetch was running. */ + kstat_named_t arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; @@ -855,14 +882,19 @@ typedef struct arc_stats { typedef struct arc_sums { wmsum_t arcstat_hits; + wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; + wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; + wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; + wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; + wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; @@ -936,8 +968,12 @@ typedef struct arc_sums { wmsum_t arcstat_prune; aggsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; + wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; + wmsum_t arcstat_demand_iohit_predictive_prefetch; + wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; + wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f51f427c1..1521caa6e 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -483,14 +483,19 @@ arc_state_t ARC_l2c_only; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", 
KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, @@ -601,8 +606,12 @@ arc_stats_t arc_stats = { { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, + { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, @@ -857,7 +866,7 @@ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); -static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); @@ -1138,9 +1147,8 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&hdr->b_l1hdr.b_arc_node); - list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); + 
list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); @@ -2283,31 +2291,20 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { - arc_state_t *state; + arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } - state = hdr->b_l1hdr.b_state; - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { + state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - multilist_remove(&state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); + arc_evictable_space_decrement(hdr, state); } } @@ -2317,13 +2314,13 @@ add_reference(arc_buf_hdr_t *hdr, const void *tag) * list making it eligible for eviction. 
*/ static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, const void *tag) +remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* @@ -2333,7 +2330,6 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, const void *tag) if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); @@ -2394,8 +2390,7 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) * for the buffer must be held by the caller. */ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; @@ -2416,6 +2411,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + + IMPLY(GHOST_STATE(old_state), bufcnt == 0); + IMPLY(GHOST_STATE(new_state), bufcnt == 0); + IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(old_state == arc_anon, bufcnt <= 1); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2423,11 +2424,13 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, update_old = B_FALSE; } update_new = update_old; + if (GHOST_STATE(old_state)) + update_old = B_TRUE; + if (GHOST_STATE(new_state)) + update_new = B_TRUE; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); - ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); - ASSERT(old_state != arc_anon || bufcnt 
<= 1); /* * If this buffer is evictable, transfer it from the @@ -2437,12 +2440,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_old = B_TRUE; - } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { @@ -2454,12 +2451,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_new = B_TRUE; - } arc_evictable_space_increment(hdr, new_state); } } @@ -3853,7 +3844,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); + VERIFY0(remove_reference(hdr, tag)); arc_hdr_destroy(hdr); return; } @@ -3867,7 +3858,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); - (void) remove_reference(hdr, hash_lock, tag); + (void) remove_reference(hdr, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } @@ -3894,20 +3885,20 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag) * only the evicted headers size. */ static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) +arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 
arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* @@ -3934,7 +3925,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, hdr, hash_lock); + arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. @@ -3943,7 +3934,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } @@ -3954,10 +3945,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - MSEC_TO_TICK(min_lifetime))) { + MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } @@ -4022,7 +4012,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); - arc_change_state(evicted_state, hdr, hash_lock); + arc_change_state(evicted_state, hdr); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); @@ -4110,8 +4100,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, if (mutex_tryenter(hash_lock)) { uint64_t revicted; - uint64_t evicted = arc_evict_hdr(hdr, hash_lock, - &revicted); + uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; @@ -5444,150 +5433,137 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) /* * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + /* + * Update buffer prefetch status. 
+ */ + boolean_t was_prefetch = HDR_PREFETCH(hdr); + boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; + if (was_prefetch != now_prefetch) { + if (was_prefetch) { + ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, + HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, + prefetch); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); + if (was_prefetch) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } + if (now_prefetch) { + if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + ARCSTAT_BUMP(arcstat_prescient_prefetch); + } else { + ARCSTAT_BUMP(arcstat_predictive_prefetch); + } + } + if (arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + + clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ - ASSERT0(hdr->b_l1hdr.b_arc_access); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mru, hdr, hash_lock); + arc_change_state(arc_mru, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { - now = ddi_get_lbolt(); + /* + * This buffer has been accessed once recently and either + * its read is still in progress or it is in the cache. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_arc_access = now; + return; + } + hdr->b_l1hdr.b_mru_hits++; + ARCSTAT_BUMP(arcstat_mru_hits); /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). 
- * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). + * If the previous access was a prefetch, then it already + * handled possible promotion, so nothing more to do for now. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - /* link protected by hash lock */ - ASSERT(multilist_link_active( - &hdr->b_l1hdr.b_arc_node)); - } else { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } + if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. + * If more than ARC_MINTIME have passed from the previous + * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. - */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + arc_change_state(arc_mfu, hdr); } - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. + * This buffer has been accessed once recently, but was + * evicted from the cache. Had the MRU been larger, it + * would have been an MRU hit, so handle it the same way, + * except we don't need to check the previous access time.
*/ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { + hdr->b_l1hdr.b_mru_ghost_hits++; + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + if (was_prefetch) { new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, hdr, hash_lock); - - hdr->b_l1hdr.b_mru_ghost_hits++; - ARCSTAT_BUMP(arcstat_mru_ghost_hits); + arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. - * If it was a prefetch, we will explicitly move it to - * the head of the list now. + * This buffer has been accessed more than once and is either + * still in the cache or being restored from one of the ghosts. */ - - hdr->b_l1hdr.b_mfu_hits++; - ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; - /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. - */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - /* - * This is a prefetch access... - * move this block back to the MRU state.
- */ - new_state = arc_mru; + if (!HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_mfu_hits++; + ARCSTAT_BUMP(arcstat_mfu_hits); } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(new_state, hdr, hash_lock); - + hdr->b_l1hdr.b_arc_access = now; + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { + /* + * This buffer has been accessed more than once recently, but + * has been evicted from the cache. Had the MFU been larger, + * it would still be cached, so move it back to the MFU state. + */ hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* - * This buffer is on the 2nd Level ARC. + * This buffer is on the 2nd Level ARC and was not accessed + * for a long time, so treat it as new and put into MRU. */ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); @@ -5630,12 +5606,12 @@ arc_buf_access(arc_buf_t *buf) hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); + ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, + !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ @@ -5768,17 +5744,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); -
- if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } + hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we @@ -5788,6 +5754,10 @@ arc_read_done(zio_t *zio) */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + + /* We need the last one to call below in original order. */ + callback_list = acb; + if (!acb->acb_done || acb->acb_nobuf) continue; @@ -5851,20 +5821,15 @@ arc_read_done(zio_t *zio) */ ASSERT(callback_cnt < 2 || hash_lock != NULL); - hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) - ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || - callback_list != NULL); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); @@ -5912,8 +5877,17 @@ arc_read_done(zio_t *zio) zio_nowait(acb->acb_zio_dummy); } - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); + callback_list = acb->acb_prev; + if (acb->acb_wait) { + mutex_enter(&acb->acb_wait_lock); + acb->acb_wait_error = zio->io_error; + acb->acb_wait = B_FALSE; + cv_signal(&acb->acb_wait_cv); + mutex_exit(&acb->acb_wait_lock); + /* acb will be freed by the waiting thread. 
*/ + } else { + kmem_free(acb, sizeof (arc_callback_t)); + } } if (freeable) @@ -6003,12 +5977,10 @@ top: */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { + boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); @@ -6016,6 +5988,7 @@ top: goto out; } + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { @@ -6029,38 +6002,28 @@ top: arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } + + DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. - * Synchronous reads use the b_cv whereas nowait reads - * register a callback. Both are signalled/called in - * arc_read_done. + * Synchronous reads use the acb_wait_cv whereas nowait + * reads register a callback. Both are signalled/called + * in arc_read_done. * - * Errors of the physical I/O may need to be propagated - * to the pio. For synchronous reads, we simply restart - * this function and it will reassess. Nowait reads + * Errors of the physical I/O may need to be propagated. + * Synchronous read errors are returned here from + * arc_read_done via acb_wait_error. Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. 
*/ - - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; - + arc_callback_t *acb = NULL; + if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; @@ -6069,46 +6032,52 @@ top: acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; + if (*arc_flags & ARC_FLAG_WAIT) { + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, + CV_DEFAULT, NULL); + } acb->acb_zb = *zb; - if (pio != NULL) + if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); + } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; + if (hdr->b_l1hdr.b_acb) + hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); + + ARCSTAT_BUMP(arcstat_iohits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, iohits); + + if (*arc_flags & ARC_FLAG_WAIT) { + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + rc = acb->acb_wait_error; + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); + } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_TRUE); + if (done && !no_buf) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. 
- */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } - ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ @@ -6130,8 +6099,7 @@ top: } } if (rc != 0) { - (void) remove_reference(hdr, hash_lock, - private); + (void) remove_reference(hdr, private); arc_buf_destroy_impl(buf); buf = NULL; } @@ -6139,25 +6107,12 @@ top: /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, hits); + *arc_flags |= ARC_FLAG_CACHED; if (done) done(NULL, zb, bp, buf, private); @@ -6201,7 +6156,6 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6236,21 +6190,23 @@ top: mutex_exit(hash_lock); goto top; } - - /* - * This is a delicate dance that we play here. 
- * This hdr might be in the ghost list so we access - * it to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); } + /* + * Call arc_adapt() explicitly before arc_access() to allow + * its logic to balance MRU/MFU based on the original state. + */ + arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); + /* + * Take additional reference for IO_IN_PROGRESS. It stops + * arc_access() from putting this header, which has no buffers + * and thus no other references but is obviously nonevictable, + * onto the evictable list of the MRU or MFU state. + */ + add_reference(hdr, hdr); + if (!embedded_bp) + arc_access(hdr, *arc_flags, B_FALSE); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); @@ -6277,24 +6233,10 @@ top: zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } - if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -6307,7 +6249,6 @@ top:
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { @@ -6348,7 +6289,7 @@ top: blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); @@ -6370,7 +6311,8 @@ top: */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + !(l2arc_noprefetch && + (*arc_flags & ARC_FLAG_PREFETCH))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; @@ -6559,10 +6501,8 @@ arc_freed(spa_t *spa, const blkptr_t *bp) /* * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is + * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, + * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU @@ -6579,9 +6519,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. 
*/ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); + if (!HDR_HAS_L1HDR(hdr) || + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { @@ -6624,7 +6564,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; @@ -6684,7 +6624,7 @@ arc_release(arc_buf_t *buf, const void *tag) VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); + (void) remove_reference(hdr, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); @@ -6799,7 +6739,7 @@ arc_release(arc_buf_t *buf, const void *tag) hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); @@ -6873,10 +6813,12 @@ arc_write_ready(zio_t *zio) callback->awcb_ready(zio, buf, callback->awcb_private); - if (HDR_IO_IN_PROGRESS(hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + add_reference(hdr, hdr); /* For IO_IN_PROGRESS. 
*/ + } if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); @@ -7063,7 +7005,7 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); + arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); @@ -7083,12 +7025,14 @@ arc_write_done(zio_t *zio) } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ } ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -7303,22 +7247,32 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); + as->arcstat_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); + as->arcstat_demand_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); + as->arcstat_demand_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); + as->arcstat_prefetch_data_iohits.value.ui64 = + 
wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); + as->arcstat_prefetch_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = @@ -7501,10 +7455,18 @@ arc_kstat_update(kstat_t *ksp, int rw) aggsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); + as->arcstat_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); + as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + as->arcstat_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); + as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = @@ -7736,14 +7698,19 @@ arc_state_init(void) zfs_refcount_create(&arc_l2c_only->arcs_size); wmsum_init(&arc_sums.arcstat_hits, 0); + wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); + 
wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); @@ -7817,8 +7784,12 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_prune, 0); aggsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); + wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); @@ -7866,14 +7837,19 @@ arc_state_fini(void) multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_sums.arcstat_hits); + wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); + wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); + wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); 
wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); @@ -7947,8 +7923,12 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_prune); aggsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); + wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); @@ -9259,7 +9239,7 @@ retry: * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. 
*/ - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 377634c72..244b9b4cb 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -185,7 +185,8 @@ static boolean_t traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index 1d63d7de6..76b8b5608 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -517,13 +517,11 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = 0; for (int64_t blk = pf_start; blk < pf_end; blk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } if (!have_lock) diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh index f53a4ac71..fd3194fe9 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh @@ -163,6 +163,7 @@ before_clone=$(get_prop written $TESTPOOL/$TESTFS1) log_must zfs clone 
$TESTPOOL/$TESTFS1@snap1 $TESTPOOL/$TESTFS1/snap1.clone log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS1/snap1.clone/testfile bs=1M \ count=40 +sync_pool after_clone=$(get_prop written $TESTPOOL/$TESTFS1) within_percent $before_clone $after_clone 99.5 || \ log_fail "unexpected written for clone $before_clone $after_clone" diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh index f2bada0eb..89ab94033 100755 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh @@ -80,7 +80,7 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio log_must zpool export $TESTPOOL -log_must zpool import -d $VDIR $TESTPOOL +log_must zpool import -N -d $VDIR $TESTPOOL # Regardless of l2arc_noprefetch, some MFU buffers might be evicted # from ARC, accessed later on as prefetches and transition to MRU as diff --git a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index 924b56935..c97772585 100755 --- a/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -95,6 +95,7 @@ for type in "" "mirror" "raidz2" "draid"; do # Fill the pool, verify the vdevs are no longer sparse. file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R + sync_pool $TESTPOOL verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS # Remove the file, wait for trim, verify the vdevs are now sparse. diff --git a/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/tests/zfs-tests/tests/functional/trim/trim_config.ksh index 9a6e19e1c..6a187a05b 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_config.ksh @@ -94,6 +94,7 @@ for type in "" "mirror" "raidz2" "draid"; do # Fill the pool, verify the vdevs are no longer sparse. 
file_write -o create -f /$TESTPOOL/file -b 1048576 -c $fill_mb -d R + sync_pool $TESTPOOL verify_vdevs "-ge" "$VDEV_MAX_MB" $VDEVS # Remove the file, issue trim, verify the vdevs are now sparse. diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index b95eca73b..2c4ef28ab 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -83,6 +83,7 @@ function do_test { # Write to zvol log_must dd if=$datafile1 of=$zvolpath conv=fsync + sync_pool # Record how much space we've used (should be 5MB, with 128k # of tolerance).