More adaptive ARC eviction

Traditionally ARC adaptation was limited to MRU/MFU distribution.  But
for years people with metadata-centric workload demanded mechanisms to
also manage data/metadata distribution, that in original ZFS was just
a FIFO.  As result ZFS effectively got separate states for data and
metadata, minimum and maximum metadata limits etc, but it all required
manual tuning, was not adaptive and in its heart remained a bad FIFO.

This change removes most of existing eviction logic, rewriting it from
scratch.  This makes MRU/MFU adaptation individual for data and meta-
data, same as the distribution between data and metadata themselves.
Since most of required states separation was already done, it only
required to make arcs_size state field specific per data/metadata.

The adaptation logic is still based on previous concept of ghost hits,
just now it balances ARC capacity between 4 states: MRU data, MRU
metadata, MFU data and MFU metadata.  To simplify arc_c changes instead
of arc_p measured in bytes, this code uses 3 variable arc_meta, arc_pd
and arc_pm, representing ARC balance between metadata and data, MRU and
MFU for data, and MRU and MFU for metadata respectively as 32-bit fixed
point fractions.  Since we care about the math result only when need to
evict, this moves all the logic from arc_adapt() to arc_evict(), that
reduces per-block overhead, since per-block operations are limited to
stats collection, now moved from arc_adapt() to arc_access() and using
cheaper wmsums.  This also allows to remove ugly ARC_HDR_DO_ADAPT flag
from many places.

This change also removes number of metadata specific tunables, part of
which were actually not functioning correctly, since not all metadata
are equal and some (like L2ARC headers) are not really evictable.
Instead it introduced single opaque knob zfs_arc_meta_balance, tuning
ARC's reaction on ghost hits, allowing administrator give more or less
preference to metadata without setting strict limits.

Some of old code parts like arc_evict_meta() are just removed, because
since introduction of ABD ARC they really make no sense: only headers
referenced by small number of buffers are not evictable, and they are
really not evictable no matter what this code do.  Instead just call
arc_prune_async() if too much metadata appear not evictable.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #14359
This commit is contained in:
Alexander Motin
2023-03-08 14:17:23 -05:00
committed by GitHub
parent 8d9752569b
commit a8d83e2a24
10 changed files with 474 additions and 781 deletions
+71 -30
View File
@@ -270,16 +270,14 @@ def draw_graph(kstats_dict):
arc_perc = f_perc(arc_stats['size'], arc_stats['c_max'])
mfu_size = f_bytes(arc_stats['mfu_size'])
mru_size = f_bytes(arc_stats['mru_size'])
meta_limit = f_bytes(arc_stats['arc_meta_limit'])
meta_size = f_bytes(arc_stats['arc_meta_used'])
dnode_limit = f_bytes(arc_stats['arc_dnode_limit'])
dnode_size = f_bytes(arc_stats['dnode_size'])
info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) '
'DNODE {6} ({7})')
info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} '
'DNODE {5} ({6})')
info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size,
meta_size, meta_limit, dnode_size,
dnode_limit)
meta_size, dnode_size, dnode_limit)
info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2)
info_line = GRAPH_INDENT+info_spc+info_line
@@ -558,16 +556,28 @@ def section_arc(kstats_dict):
arc_target_size = arc_stats['c']
arc_max = arc_stats['c_max']
arc_min = arc_stats['c_min']
anon_size = arc_stats['anon_size']
mfu_size = arc_stats['mfu_size']
mru_size = arc_stats['mru_size']
mfug_size = arc_stats['mfu_ghost_size']
mrug_size = arc_stats['mru_ghost_size']
unc_size = arc_stats['uncached_size']
meta_limit = arc_stats['arc_meta_limit']
meta_size = arc_stats['arc_meta_used']
meta = arc_stats['meta']
pd = arc_stats['pd']
pm = arc_stats['pm']
anon_data = arc_stats['anon_data']
anon_metadata = arc_stats['anon_metadata']
mfu_data = arc_stats['mfu_data']
mfu_metadata = arc_stats['mfu_metadata']
mru_data = arc_stats['mru_data']
mru_metadata = arc_stats['mru_metadata']
mfug_data = arc_stats['mfu_ghost_data']
mfug_metadata = arc_stats['mfu_ghost_metadata']
mrug_data = arc_stats['mru_ghost_data']
mrug_metadata = arc_stats['mru_ghost_metadata']
unc_data = arc_stats['uncached_data']
unc_metadata = arc_stats['uncached_metadata']
bonus_size = arc_stats['bonus_size']
dnode_limit = arc_stats['arc_dnode_limit']
dnode_size = arc_stats['dnode_size']
dbuf_size = arc_stats['dbuf_size']
hdr_size = arc_stats['hdr_size']
l2_hdr_size = arc_stats['l2_hdr_size']
abd_chunk_waste_size = arc_stats['abd_chunk_waste_size']
target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min))
prt_2('ARC size (current):',
@@ -578,25 +588,56 @@ def section_arc(kstats_dict):
f_perc(arc_min, arc_max), f_bytes(arc_min))
prt_i2('Max size (high water):',
target_size_ratio, f_bytes(arc_max))
caches_size = int(anon_size)+int(mfu_size)+int(mru_size)+int(unc_size)
prt_i2('Anonymouns data size:',
f_perc(anon_size, caches_size), f_bytes(anon_size))
prt_i2('Most Frequently Used (MFU) cache size:',
f_perc(mfu_size, caches_size), f_bytes(mfu_size))
prt_i2('Most Recently Used (MRU) cache size:',
f_perc(mru_size, caches_size), f_bytes(mru_size))
prt_i1('Most Frequently Used (MFU) ghost size:', f_bytes(mfug_size))
prt_i1('Most Recently Used (MRU) ghost size:', f_bytes(mrug_size))
caches_size = int(anon_data)+int(anon_metadata)+\
int(mfu_data)+int(mfu_metadata)+int(mru_data)+int(mru_metadata)+\
int(unc_data)+int(unc_metadata)
prt_i2('Anonymous data size:',
f_perc(anon_data, caches_size), f_bytes(anon_data))
prt_i2('Anonymous metadata size:',
f_perc(anon_metadata, caches_size), f_bytes(anon_metadata))
s = 4294967296
v = (s-int(pd))*(s-int(meta))/s
prt_i2('MFU data target:', f_perc(v, s),
f_bytes(v / 65536 * caches_size / 65536))
prt_i2('MFU data size:',
f_perc(mfu_data, caches_size), f_bytes(mfu_data))
prt_i1('MFU ghost data size:', f_bytes(mfug_data))
v = (s-int(pm))*int(meta)/s
prt_i2('MFU metadata target:', f_perc(v, s),
f_bytes(v / 65536 * caches_size / 65536))
prt_i2('MFU metadata size:',
f_perc(mfu_metadata, caches_size), f_bytes(mfu_metadata))
prt_i1('MFU ghost metadata size:', f_bytes(mfug_metadata))
v = int(pd)*(s-int(meta))/s
prt_i2('MRU data target:', f_perc(v, s),
f_bytes(v / 65536 * caches_size / 65536))
prt_i2('MRU data size:',
f_perc(mru_data, caches_size), f_bytes(mru_data))
prt_i1('MRU ghost data size:', f_bytes(mrug_data))
v = int(pm)*int(meta)/s
prt_i2('MRU metadata target:', f_perc(v, s),
f_bytes(v / 65536 * caches_size / 65536))
prt_i2('MRU metadata size:',
f_perc(mru_metadata, caches_size), f_bytes(mru_metadata))
prt_i1('MRU ghost metadata size:', f_bytes(mrug_metadata))
prt_i2('Uncached data size:',
f_perc(unc_size, caches_size), f_bytes(unc_size))
prt_i2('Metadata cache size (hard limit):',
f_perc(meta_limit, arc_max), f_bytes(meta_limit))
prt_i2('Metadata cache size (current):',
f_perc(meta_size, meta_limit), f_bytes(meta_size))
prt_i2('Dnode cache size (hard limit):',
f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit))
prt_i2('Dnode cache size (current):',
f_perc(unc_data, caches_size), f_bytes(unc_data))
prt_i2('Uncached metadata size:',
f_perc(unc_metadata, caches_size), f_bytes(unc_metadata))
prt_i2('Bonus size:',
f_perc(bonus_size, arc_size), f_bytes(bonus_size))
prt_i2('Dnode cache target:',
f_perc(dnode_limit, arc_max), f_bytes(dnode_limit))
prt_i2('Dnode cache size:',
f_perc(dnode_size, dnode_limit), f_bytes(dnode_size))
prt_i2('Dbuf size:',
f_perc(dbuf_size, arc_size), f_bytes(dbuf_size))
prt_i2('Header size:',
f_perc(hdr_size, arc_size), f_bytes(hdr_size))
prt_i2('L2 header size:',
f_perc(l2_hdr_size, arc_size), f_bytes(l2_hdr_size))
prt_i2('ABD chunk waste size:',
f_perc(abd_chunk_waste_size, arc_size), f_bytes(abd_chunk_waste_size))
print()
print('ARC hash breakdown:')