From da8ccd0ee0d4eed08200e706cb2ca1da3bdfb5cf Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Mon, 30 Dec 2013 09:30:00 -0800 Subject: [PATCH] Prioritize "metadata" in arc_get_data_buf When the arc is at it's size limit and a new buffer is added, data will be evicted (or recycled) from the arc to make room for this new buffer. As far as I can tell, this is to try and keep the arc from over stepping it's bounds (i.e. keep it below the size limitation placed on it). This makes sense conceptually, but there appears to be a subtle flaw in its current implementation, resulting in metadata buffers being throttled. When it evicts from the arc's lists, it also passes in a "type" so as to remove a buffer of the same type that it is adding. The problem with this is that once the size limit is hit, the ratio of "metadata" to "data" contained in the arc essentially becomes fixed. For example, consider the following scenario: * the size of the arc is capped at 10G * the meta_limit is capped at 4G * 9G of the arc contains "data" * 1G of the arc contains "metadata" Now, every time a new "metadata" buffer is created and added to the arc, an older "metadata" buffer(s) will be removed from the arc; preserving the 9G "data" to 1G "metadata" ratio that was in-place when the size limit was reached. This occurs even though the amount of "metadata" is far below the "metadata" limit. This can result in the arc behaving pathologically for certain workloads. To fix this, the arc_get_data_buf function was modified to evict "data" from the arc even when adding a "metadata" buffer; unless it's at the "metadata" limit. In addition, arc_evict now more closely resembles arc_evict_ghost; such that when evicting "data" from the arc, it may make a second pass over the arc lists and evict "metadata" if it cannot meet the eviction size the first time around. Signed-off-by: Prakash Surya Signed-off-by: Brian Behlendorf Issue #2110 --- module/zfs/arc.c | 62 +++++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f28748423..242d0c8c5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1890,6 +1890,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; +top: mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); @@ -2005,6 +2006,15 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_evicted < bytes)) { + /* Prevent second pass from recycling metadata into data */ + recycle = FALSE; + type = ARC_BUFC_METADATA; + list = &state->arcs_list[type]; + goto top; + } + if (bytes_evicted < bytes) dprintf("only evicted %lld bytes from %x\n", (longlong_t)bytes_evicted, state); @@ -2146,16 +2156,9 @@ arc_adjust(void) adjustment = MIN((int64_t)(arc_size - arc_c), (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + if (adjustment > 0 && arc_mru->arcs_size > 0) { + delta = MIN(arc_mru->arcs_size, adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } - - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, - ARC_BUFC_METADATA); } /* @@ -2164,17 +2167,9 @@ arc_adjust(void) adjustment = arc_size - arc_c; - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + if (adjustment > 0 && arc_mfu->arcs_size > 0) { + delta = MIN(arc_mfu->arcs_size, adjustment); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } - - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t delta = MIN(adjustment, - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); - (void) arc_evict(arc_mfu, 0, delta, FALSE, - ARC_BUFC_METADATA); } /* @@ -2752,6 +2747,8 @@ arc_get_data_buf(arc_buf_t *buf) arc_state_t *state = buf->b_hdr->b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t evict = ARC_BUFC_DATA; + boolean_t recycle = TRUE; arc_adapt(size, state); @@ -2792,7 +2789,24 @@ arc_get_data_buf(arc_buf_t *buf) mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { + /* + * Evict data buffers prior to metadata buffers, unless we're + * over the metadata limit and adding a metadata buffer. + */ + if (type == ARC_BUFC_METADATA) { + if (arc_meta_used >= arc_meta_limit) + evict = ARC_BUFC_METADATA; + else + /* + * In this case, we're evicting data while + * adding metadata. Thus, to prevent recycling a + * data buffer into a metadata buffer, recycling + * is disabled in the following arc_evict call. + */ + recycle = FALSE; + } + + if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); @@ -2803,8 +2817,10 @@ arc_get_data_buf(arc_buf_t *buf) * via the prune callback to drop references. The * prune callback in run in the context of the reclaim * thread to avoid deadlocking on the hash_lock. + * Of course, only do this when recycle is true. */ - cv_signal(&arc_reclaim_thr_cv); + if (recycle) + cv_signal(&arc_reclaim_thr_cv); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); @@ -2812,7 +2828,9 @@ arc_get_data_buf(arc_buf_t *buf) atomic_add_64(&arc_size, size); } - ARCSTAT_BUMP(arcstat_recycle_miss); + /* Only bump this if we tried to recycle and failed */ + if (recycle) + ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); out: