diff --git a/cmd/arc_summary b/cmd/arc_summary index 714962946..5d10e903f 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -270,16 +270,14 @@ def draw_graph(kstats_dict): arc_perc = f_perc(arc_stats['size'], arc_stats['c_max']) mfu_size = f_bytes(arc_stats['mfu_size']) mru_size = f_bytes(arc_stats['mru_size']) - meta_limit = f_bytes(arc_stats['arc_meta_limit']) meta_size = f_bytes(arc_stats['arc_meta_used']) dnode_limit = f_bytes(arc_stats['arc_dnode_limit']) dnode_size = f_bytes(arc_stats['dnode_size']) - info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) ' - 'DNODE {6} ({7})') + info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ' + 'DNODE {5} ({6})') info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size, - meta_size, meta_limit, dnode_size, - dnode_limit) + meta_size, dnode_size, dnode_limit) info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) info_line = GRAPH_INDENT+info_spc+info_line @@ -558,16 +556,28 @@ def section_arc(kstats_dict): arc_target_size = arc_stats['c'] arc_max = arc_stats['c_max'] arc_min = arc_stats['c_min'] - anon_size = arc_stats['anon_size'] - mfu_size = arc_stats['mfu_size'] - mru_size = arc_stats['mru_size'] - mfug_size = arc_stats['mfu_ghost_size'] - mrug_size = arc_stats['mru_ghost_size'] - unc_size = arc_stats['uncached_size'] - meta_limit = arc_stats['arc_meta_limit'] - meta_size = arc_stats['arc_meta_used'] + meta = arc_stats['meta'] + pd = arc_stats['pd'] + pm = arc_stats['pm'] + anon_data = arc_stats['anon_data'] + anon_metadata = arc_stats['anon_metadata'] + mfu_data = arc_stats['mfu_data'] + mfu_metadata = arc_stats['mfu_metadata'] + mru_data = arc_stats['mru_data'] + mru_metadata = arc_stats['mru_metadata'] + mfug_data = arc_stats['mfu_ghost_data'] + mfug_metadata = arc_stats['mfu_ghost_metadata'] + mrug_data = arc_stats['mru_ghost_data'] + mrug_metadata = arc_stats['mru_ghost_metadata'] + unc_data = arc_stats['uncached_data'] + unc_metadata = arc_stats['uncached_metadata'] + bonus_size = arc_stats['bonus_size'] dnode_limit = arc_stats['arc_dnode_limit'] dnode_size = arc_stats['dnode_size'] + dbuf_size = arc_stats['dbuf_size'] + hdr_size = arc_stats['hdr_size'] + l2_hdr_size = arc_stats['l2_hdr_size'] + abd_chunk_waste_size = arc_stats['abd_chunk_waste_size'] target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min)) prt_2('ARC size (current):', @@ -578,25 +588,56 @@ def section_arc(kstats_dict): f_perc(arc_min, arc_max), f_bytes(arc_min)) prt_i2('Max size (high water):', target_size_ratio, f_bytes(arc_max)) - caches_size = int(anon_size)+int(mfu_size)+int(mru_size)+int(unc_size) - prt_i2('Anonymouns data size:', - f_perc(anon_size, caches_size), f_bytes(anon_size)) - prt_i2('Most Frequently Used (MFU) cache size:', - f_perc(mfu_size, caches_size), f_bytes(mfu_size)) - prt_i2('Most Recently Used (MRU) cache size:', - f_perc(mru_size, caches_size), f_bytes(mru_size)) - prt_i1('Most Frequently Used (MFU) ghost size:', f_bytes(mfug_size)) - prt_i1('Most Recently Used (MRU) ghost size:', f_bytes(mrug_size)) + caches_size = int(anon_data)+int(anon_metadata)+\ + int(mfu_data)+int(mfu_metadata)+int(mru_data)+int(mru_metadata)+\ + int(unc_data)+int(unc_metadata) + prt_i2('Anonymous data size:', + f_perc(anon_data, caches_size), f_bytes(anon_data)) + prt_i2('Anonymous metadata size:', + f_perc(anon_metadata, caches_size), f_bytes(anon_metadata)) + s = 4294967296 + v = (s-int(pd))*(s-int(meta))/s + prt_i2('MFU data target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MFU data size:', + 
f_perc(mfu_data, caches_size), f_bytes(mfu_data)) + prt_i1('MFU ghost data size:', f_bytes(mfug_data)) + v = (s-int(pm))*int(meta)/s + prt_i2('MFU metadata target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MFU metadata size:', + f_perc(mfu_metadata, caches_size), f_bytes(mfu_metadata)) + prt_i1('MFU ghost metadata size:', f_bytes(mfug_metadata)) + v = int(pd)*(s-int(meta))/s + prt_i2('MRU data target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MRU data size:', + f_perc(mru_data, caches_size), f_bytes(mru_data)) + prt_i1('MRU ghost data size:', f_bytes(mrug_data)) + v = int(pm)*int(meta)/s + prt_i2('MRU metadata target:', f_perc(v, s), + f_bytes(v / 65536 * caches_size / 65536)) + prt_i2('MRU metadata size:', + f_perc(mru_metadata, caches_size), f_bytes(mru_metadata)) + prt_i1('MRU ghost metadata size:', f_bytes(mrug_metadata)) prt_i2('Uncached data size:', - f_perc(unc_size, caches_size), f_bytes(unc_size)) - prt_i2('Metadata cache size (hard limit):', - f_perc(meta_limit, arc_max), f_bytes(meta_limit)) - prt_i2('Metadata cache size (current):', - f_perc(meta_size, meta_limit), f_bytes(meta_size)) - prt_i2('Dnode cache size (hard limit):', - f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit)) - prt_i2('Dnode cache size (current):', + f_perc(unc_data, caches_size), f_bytes(unc_data)) + prt_i2('Uncached metadata size:', + f_perc(unc_metadata, caches_size), f_bytes(unc_metadata)) + prt_i2('Bonus size:', + f_perc(bonus_size, arc_size), f_bytes(bonus_size)) + prt_i2('Dnode cache target:', + f_perc(dnode_limit, arc_max), f_bytes(dnode_limit)) + prt_i2('Dnode cache size:', f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) + prt_i2('Dbuf size:', + f_perc(dbuf_size, arc_size), f_bytes(dbuf_size)) + prt_i2('Header size:', + f_perc(hdr_size, arc_size), f_bytes(hdr_size)) + prt_i2('L2 header size:', + f_perc(l2_hdr_size, arc_size), f_bytes(l2_hdr_size)) + prt_i2('ABD chunk waste size:', + f_perc(abd_chunk_waste_size, arc_size), f_bytes(abd_chunk_waste_size)) print() print('ARC hash breakdown:') diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index c6198ee26..329562418 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -117,7 +117,6 @@ zdb_ot_name(dmu_object_type_t type) extern int reference_tracking_enable; extern int zfs_recover; -extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; extern uint_t zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; @@ -8809,8 +8808,8 @@ main(int argc, char **argv) * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ - zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT; - zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; + zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT; + zfs_arc_max = 256 * 1024 * 1024; #endif /* diff --git a/include/sys/arc.h b/include/sys/arc.h index 2b4f16ee0..836ed679d 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -200,7 +200,6 @@ struct arc_buf { }; typedef enum arc_buf_contents { - ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 721b05023..fd24d2f3c 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -82,15 +82,18 @@ typedef struct arc_state { * supports the "dbufs" kstat */ arc_state_type_t arcs_state; + /* + * total amount of data in this state. 
+ */ + zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of evictable data in this state */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned; + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + * amount of hit bytes for this state (counted only for ghost states) */ - zfs_refcount_t arcs_size; + wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; @@ -358,8 +361,9 @@ typedef struct l2arc_lb_ptr_buf { #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) -#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) -#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +/* +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal. */ +#define L2BLK_GET_TYPE(field) (BF64_GET((field), 48, 8) - 1) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, (x) + 1) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) @@ -582,7 +586,9 @@ typedef struct arc_stats { kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; + kstat_named_t arcstat_meta; + kstat_named_t arcstat_pd; + kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; @@ -655,6 +661,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_anon_size; + kstat_named_t arcstat_anon_data; + kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -676,6 +684,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_mru_size; + kstat_named_t arcstat_mru_data; + kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -700,6 +710,8 @@ typedef struct arc_stats { * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; + kstat_named_t arcstat_mru_ghost_data; + kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -719,6 +731,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_mfu_size; + kstat_named_t arcstat_mfu_data; + kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu @@ -737,6 +751,8 @@ typedef struct arc_stats { * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; + kstat_named_t arcstat_mfu_ghost_data; + kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -754,6 +770,8 @@ typedef struct arc_stats { * ARC_FLAG_UNCACHED being set. 
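The +/- 1 added to L2BLK_GET_TYPE/L2BLK_SET_TYPE above exists because dropping ARC_BUFC_INVALID shifts the remaining enum values down by one, while L2ARC log blocks already on disk still carry the old numeric values in that 8-bit field. A minimal Python sketch of the round trip; the helper names and enum values here are illustrative stand-ins, not the kernel macros:

```python
# Model the 8-bit "type" field at bit offset 48 of an L2ARC log-block word and
# show why the new accessors add and subtract 1.  Old enum: INVALID=0, DATA=1,
# METADATA=2; new enum after the removal: DATA=0, METADATA=1.
OFFSET, BITS = 48, 8

def bf64_get(field, off, bits):
    return (field >> off) & ((1 << bits) - 1)

def bf64_set(field, off, bits, x):
    mask = ((1 << bits) - 1) << off
    return (field & ~mask) | ((x << off) & mask)

NEW_DATA, NEW_METADATA = 0, 1            # values after ARC_BUFC_INVALID removal

def l2blk_set_type(field, x):            # new macro stores x + 1
    return bf64_set(field, OFFSET, BITS, x + 1)

def l2blk_get_type(field):               # new macro returns stored - 1
    return bf64_get(field, OFFSET, BITS) - 1

field = l2blk_set_type(0, NEW_METADATA)
assert bf64_get(field, OFFSET, BITS) == 2     # same on-disk value as old ARC_BUFC_METADATA
assert l2blk_get_type(field) == NEW_METADATA  # round trip with the new enum
```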
*/ kstat_named_t arcstat_uncached_size; + kstat_named_t arcstat_uncached_data; + kstat_named_t arcstat_uncached_metadata; /* * Number of data bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. @@ -876,10 +894,7 @@ typedef struct arc_stats { kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; /* Number of predictive prefetch requests. */ kstat_named_t arcstat_predictive_prefetch; @@ -942,7 +957,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - aggsum_t arcstat_dnode_size; + wmsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; @@ -987,7 +1002,7 @@ typedef struct arc_sums { wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; - aggsum_t arcstat_meta_used; + wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; @@ -1015,7 +1030,9 @@ typedef struct arc_evict_waiter { #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ +#define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ +#define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 88a044f63..e8e2cfec6 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -558,14 +558,6 @@ This value acts as a ceiling to the amount of dnode metadata, and defaults to which indicates that a percent which is based on .Sy zfs_arc_dnode_limit_percent of the ARC meta buffers that may be used for dnodes. -.Pp -Also see -.Sy zfs_arc_meta_prune -which serves a similar purpose but is used -when the amount of metadata in the ARC exceeds -.Sy zfs_arc_meta_limit -rather than in response to overall demand for non-metadata. -. .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage that can be consumed by dnodes of ARC meta buffers. .Pp @@ -648,62 +640,10 @@ It cannot be set back to while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. . -.It Sy zfs_arc_meta_adjust_restarts Ns = Ns Sy 4096 Pq uint -The number of restart passes to make while scanning the ARC attempting -the free buffers in order to stay below the -.Sy fs_arc_meta_limit . -This value should not need to be tuned but is available to facilitate -performance analysis. -. -.It Sy zfs_arc_meta_limit Ns = Ns Sy 0 Ns B Pq u64 -The maximum allowed size in bytes that metadata buffers are allowed to -consume in the ARC. -When this limit is reached, metadata buffers will be reclaimed, -even if the overall -.Sy arc_c_max -has not been reached. -It defaults to -.Sy 0 , -which indicates that a percentage based on -.Sy zfs_arc_meta_limit_percent -of the ARC may be used for metadata. 
-.Pp -This value my be changed dynamically, except that must be set to an explicit -value -.Pq cannot be set back to Sy 0 . -. -.It Sy zfs_arc_meta_limit_percent Ns = Ns Sy 75 Ns % Pq u64 -Percentage of ARC buffers that can be used for metadata. -.Pp -See also -.Sy zfs_arc_meta_limit , -which serves a similar purpose but has a higher priority if nonzero. -. -.It Sy zfs_arc_meta_min Ns = Ns Sy 0 Ns B Pq u64 -The minimum allowed size in bytes that metadata buffers may consume in -the ARC. -. -.It Sy zfs_arc_meta_prune Ns = Ns Sy 10000 Pq int -The number of dentries and inodes to be scanned looking for entries -which can be dropped. -This may be required when the ARC reaches the -.Sy zfs_arc_meta_limit -because dentries and inodes can pin buffers in the ARC. -Increasing this value will cause to dentry and inode caches -to be pruned more aggressively. -Setting this value to -.Sy 0 -will disable pruning the inode and dentry caches. -. -.It Sy zfs_arc_meta_strategy Ns = Ns Sy 1 Ns | Ns 0 Pq uint -Define the strategy for ARC metadata buffer eviction (meta reclaim strategy): -.Bl -tag -compact -offset 4n -width "0 (META_ONLY)" -.It Sy 0 Pq META_ONLY -evict only the ARC metadata buffers -.It Sy 1 Pq BALANCED -additional data buffers may be evicted if required -to evict the required number of metadata buffers. -.El +.It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint +Balance between metadata and data on ghost hits. +Values above 100 increase metadata caching by proportionally reducing effect +of ghost data hits on target data/metadata rate. . .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 Min size of ARC in bytes. @@ -786,20 +726,6 @@ causes the ARC to start reclamation if it exceeds the target size by of the target size, and block allocations by .Em 0.6% . . -.It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq uint -If nonzero, this will update -.Sy arc_p_min_shift Pq default Sy 4 -with the new value. -.Sy arc_p_min_shift No is used as a shift of Sy arc_c -when calculating the minumum -.Sy arc_p No size . -. -.It Sy zfs_arc_p_dampener_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int -Disable -.Sy arc_p -adapt dampener, which reduces the maximum single adjustment to -.Sy arc_p . -. .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint If nonzero, this will update .Sy arc_shrink_shift Pq default Sy 7 diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index dfe5c3d31..a2ff0f386 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -159,7 +159,7 @@ arc_prune_task(void *arg) /* * Notify registered consumers they must drop holds on a portion of the ARC * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * honor the metadata limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. 
* * This operation is performed asynchronously so it may be safely called diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index bd6cfc86c..35edea0a2 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -359,89 +359,114 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, "No reads during writes (LEGACY)"); /* END CSTYLED */ +static int +param_get_arc_state_size(SYSCTL_HANDLER_ARGS) +{ + arc_state_t *state = (arc_state_t *)arg1; + int64_t val; + + val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + return (sysctl_handle_64(oidp, &val, 0, req)); +} + extern arc_state_t ARC_anon; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, - &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_anon, 0, param_get_arc_state_size, "Q", + "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in anonymous state"); + "size of evictable metadata in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in anonymous state"); + "size of evictable data in anonymous state"); /* END CSTYLED */ extern arc_state_t ARC_mru; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, - &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru, 0, param_get_arc_state_size, "Q", + "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru state"); + "size of evictable metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru state"); + "size of evictable data in mru state"); /* END CSTYLED */ extern arc_state_t ARC_mru_ghost; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, - &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", + "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mru ghost state"); + "size of evictable metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mru ghost state"); + "size of evictable data in mru ghost state"); /* END CSTYLED */ extern arc_state_t ARC_mfu; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, - &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu, 0, param_get_arc_state_size, "Q", + "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu state"); + "size of evictable metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, 
CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu state"); + "size of evictable data in mfu state"); /* END CSTYLED */ extern arc_state_t ARC_mfu_ghost; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", + "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in mfu ghost state"); + "size of evictable metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in mfu ghost state"); + "size of evictable data in mfu ghost state"); /* END CSTYLED */ extern arc_state_t ARC_uncached; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_size, CTLFLAG_RD, - &ARC_uncached.arcs_size.rc_count, 0, "size of uncached state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_uncached, 0, param_get_arc_state_size, "Q", + "size of uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of metadata in uncached state"); + "size of evictable metadata in uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of data in uncached state"); + "size of evictable data in uncached state"); /* END CSTYLED */ extern arc_state_t ARC_l2c_only; /* BEGIN CSTYLED */ -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, - &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, + CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + &ARC_l2c_only, 0, param_get_arc_state_size, "Q", + "size of l2c_only state"); /* END CSTYLED */ /* dbuf.c */ diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 6f730e9dd..b7d605352 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -504,7 +504,7 @@ arc_prune_task(void *ptr) /* * Notify registered consumers they must drop holds on a portion of the ARC * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * honor the metadata limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asynchronously so it may be safely called diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d851e919e..aff438777 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -108,12 +108,11 @@ * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In + * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. 
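For a rough feel of how the reworked eviction path (see the arc_evict() hunk later in this patch) sizes those asynchronous prune requests, here is a hedged Python sketch; the dnode size and the inputs are illustrative stand-ins for sizeof (dnode_t) and the live counters:

```python
# Sketch of the prune-request sizing in the new arc_evict(): ask the upper
# layers (dentry/inode caches) to drop a share of the dnodes either when most
# of the wanted metadata space is pinned, or when dnodes exceed their limit.
SIZEOF_DNODE = 808                      # assumed dnode_t size, illustration only
zfs_arc_dnode_reduce_percent = 10

def prune_request(dnode_bytes, dnode_limit, meta_unevictable, meta_target):
    """Return how many dnodes to ask the registered prune callbacks to drop."""
    if meta_unevictable > meta_target * 3 // 4:
        # Most of the metadata target is unevictable: prune a fixed share.
        return dnode_bytes // SIZEOF_DNODE * zfs_arc_dnode_reduce_percent // 100
    if dnode_bytes > dnode_limit:
        # Over the dnode limit: prune a share of the excess.
        return ((dnode_bytes - dnode_limit) // SIZEOF_DNODE *
                zfs_arc_dnode_reduce_percent // 100)
    return 0

print(prune_request(2 << 30, 1 << 30, 0, 1 << 30))   # over the dnode limit
```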
+ * they can be reclaimed. For example, when using the ZPL each dentry + * holds a references on a znode. These dentries must be pruned before + * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. @@ -377,9 +376,6 @@ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; -/* shift of arc_c for calculating both min and max arc_p */ -static uint_t arc_p_min_shift = 4; - /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; @@ -422,13 +418,10 @@ boolean_t arc_warm; */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; -uint64_t zfs_arc_meta_limit = 0; -uint64_t zfs_arc_meta_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; -static uint_t zfs_arc_p_min_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* @@ -447,10 +440,11 @@ static const unsigned long zfs_arc_pool_dirty_percent = 20; int zfs_compressed_arc_enabled = B_TRUE; /* - * ARC will evict meta buffers that exceed arc_meta_limit. This - * tunable make arc_meta_limit adjustable for different workloads. + * Balance between metadata and data on ghost hits. Values above 100 + * increase metadata caching by proportionally reducing effect of ghost + * data hits on target data/metadata rate. */ -static uint64_t zfs_arc_meta_limit_percent = 75; +static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. @@ -463,10 +457,6 @@ static uint_t zfs_arc_dnode_limit_percent = 10; static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; -static int zfs_arc_p_dampener_disable = 1; -static uint_t zfs_arc_meta_prune = 10000; -static uint_t zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; -static uint_t zfs_arc_meta_adjust_restarts = 4096; static uint_t zfs_arc_lotsfree_percent = 10; /* @@ -520,7 +510,9 @@ arc_stats_t arc_stats = { { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, + { "meta", KSTAT_DATA_UINT64 }, + { "pd", KSTAT_DATA_UINT64 }, + { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, @@ -538,21 +530,33 @@ arc_stats_t arc_stats = { { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_data", KSTAT_DATA_UINT64 }, + { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_data", KSTAT_DATA_UINT64 }, + { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_data", KSTAT_DATA_UINT64 }, + { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_data", 
KSTAT_DATA_UINT64 }, + { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, + { "uncached_data", KSTAT_DATA_UINT64 }, + { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, @@ -607,10 +611,7 @@ arc_stats_t arc_stats = { { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, @@ -683,10 +684,7 @@ static kstat_t *arc_ksp; */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -/* max size for dnodes */ -#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; @@ -859,7 +857,6 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, - ARC_HDR_DO_ADAPT = 0x2, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, }; @@ -1875,7 +1872,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, @@ -1902,8 +1899,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -2420,7 +2416,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() @@ -2465,7 +2461,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { - multilist_remove(&old_state->arcs_list[buftype], + multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } @@ -2478,7 +2474,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * beforehand. 
*/ ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(&new_state->arcs_list[buftype], hdr); + multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } @@ -2501,7 +2497,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * the reference. As a result, we use the arc * header pointer for the reference. */ - (void) zfs_refcount_add_many(&new_state->arcs_size, + (void) zfs_refcount_add_many( + &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2529,20 +2526,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) continue; (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } @@ -2563,7 +2560,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) * header on the ghost state. */ - (void) zfs_refcount_remove_many(&old_state->arcs_size, + (void) zfs_refcount_remove_many( + &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; @@ -2589,8 +2587,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) continue; (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); + &old_state->arcs_size[type], + arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -2598,14 +2596,14 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), - hdr); + &old_state->arcs_size[type], + arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, HDR_GET_PSIZE(hdr), - hdr); + &old_state->arcs_size[type], + HDR_GET_PSIZE(hdr), hdr); } } } @@ -2639,7 +2637,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, space); + ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2662,7 +2660,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) - aggsum_add(&arc_sums.arcstat_meta_used, space); + ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } @@ -2685,7 +2683,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, -space); + ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -2701,13 +2699,8 @@ arc_space_return(uint64_t space, arc_space_type_t type) break; } - if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { - ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, - space) >= 0); - ARCSTAT_MAX(arcstat_meta_max, - aggsum_upper_bound(&arc_sums.arcstat_meta_used)); - aggsum_add(&arc_sums.arcstat_meta_used, -space); - } + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) + ARCSTAT_INCR(arcstat_meta_used, -space); 
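arcstat_meta_used and arcstat_dnode_size move from aggsum to wmsum in these hunks, trading exact upper bounds for cheaper updates. A toy Python model of the write-mostly-sum idea (per-bucket counters summed only on read); this illustrates the concept, not the actual wmsum implementation:

```python
# Toy "write-mostly sum": updates touch only a per-CPU bucket and never
# contend, while the rare read pays the cost of summing all buckets.
import itertools

class ToyWmsum:
    def __init__(self, ncpus=4):
        self.buckets = [0] * ncpus
        self._cpu = itertools.cycle(range(ncpus))   # stand-in for "current CPU"

    def add(self, delta):
        self.buckets[next(self._cpu)] += delta      # cheap, uncontended update

    def value(self):
        return sum(self.buckets)                    # reads aggregate the buckets

meta_used = ToyWmsum()
for sz in (512, 4096, -512):
    meta_used.add(sz)
print(meta_used.value())    # 4096
```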
ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); @@ -2974,7 +2967,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { @@ -3007,7 +3000,8 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, @@ -3036,7 +3030,8 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); @@ -3537,7 +3532,7 @@ arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) /* unset all members of the original hdr */ memset(&hdr->b_dva, 0, sizeof (dva_t)); hdr->b_birth = 0; - hdr->b_type = ARC_BUFC_INVALID; + hdr->b_type = 0; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; @@ -4195,8 +4190,7 @@ arc_state_alloc_markers(int count) /* * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_type() and - * arc_evict_state_impl(). + * a marker. This fact is used in arc_evict_state_impl(). */ markers[i]->b_spa = 0; @@ -4226,8 +4220,8 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count) * the given arc state; which is used by arc_flush(). */ static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, - arc_buf_contents_t type) +arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, + uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; @@ -4265,19 +4259,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; - /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. 
- */ - if (type == ARC_BUFC_DATA && aggsum_compare( - &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { - arc_prune_async((aggsum_upper_bound( - &arc_sums.arcstat_dnode_size) - - arc_dnode_size_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all @@ -4362,7 +4343,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { - evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; @@ -4372,252 +4353,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, } /* - * Evict the specified number of bytes from the state specified, - * restricting eviction to the spa and type given. This function - * prevents us from trying to evict more from a state's list than - * is "evictable", and to skip evicting altogether when passed a + * Evict the specified number of bytes from the state specified. This + * function prevents us from trying to evict more from a state's list + * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t -arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) +arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); - return (arc_evict_state(state, spa, delta, type)); + return (arc_evict_state(state, type, 0, delta)); } return (0); } /* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. + * Adjust specified fraction, taking into account initial ghost state(s) size, + * ghost hit bytes towards increasing the fraction, ghost hit bytes towards + * decreasing it, plus a balance factor, controlling the decrease rate, used + * to balance metadata vs data. 
*/ static uint64_t -arc_evict_meta_balanced(uint64_t meta_used) +arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, + uint_t balance) { - int64_t delta, adjustmnt; - uint64_t total_evicted = 0, prune = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - uint_t restarts = zfs_arc_meta_adjust_restarts; - -restart: - /* - * This slightly differs than the way we evict from the mru in - * arc_evict because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. - */ - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mru, 0, delta, type); - adjustmnt -= delta; - } + if (total < 8 || up + down == 0) + return (frac); /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. + * We should not have more ghost hits than ghost size, but they + * may get close. Restrict maximum adjustment in that case. */ - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); + if (up + down >= total / 4) { + uint64_t scale = (up + down) / (total / 8); + up /= scale; + down /= scale; } - adjustmnt = meta_used - arc_meta_limit; + /* Get maximal dynamic range by choosing optimal shifts. */ + int s = highbit64(total); + s = MIN(64 - s, 32); - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } + uint64_t ofrac = (1ULL << 32) - frac; - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); - } + if (frac >= 4 * ofrac) + up /= frac / (2 * ofrac + 1); + up = (up << s) / (total >> (32 - s)); + if (ofrac >= 4 * frac) + down /= ofrac / (2 * frac + 1); + down = (down << s) / (total >> (32 - s)); + down = down * 100 / balance; - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. 
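A close Python transliteration of the new arc_evict_adj() above, useful for experimenting with the fraction arithmetic (Python integers do not wrap, so 64-bit overflow behavior is not modeled):

```python
# frac is a 32-bit fixed-point fraction (0 .. 2**32), nudged up by ghost hits
# that favor it and down by ghost hits against it; `balance`
# (zfs_arc_meta_balance, default 500) slows the downward moves when adjusting
# the metadata fraction.
def arc_evict_adj(frac, total, up, down, balance):
    if total < 8 or up + down == 0:
        return frac
    # Cap the adjustment if ghost hits approach the ghost state size.
    if up + down >= total // 4:
        scale = (up + down) // (total // 8)
        up //= scale
        down //= scale
    # Pick a shift that maximizes dynamic range for the divisions below.
    s = min(64 - total.bit_length(), 32)
    ofrac = (1 << 32) - frac
    if frac >= 4 * ofrac:
        up //= frac // (2 * ofrac + 1)
    up = (up << s) // (total >> (32 - s))
    if ofrac >= 4 * frac:
        down //= ofrac // (2 * frac + 1)
    down = (down << s) // (total >> (32 - s))
    down = down * 100 // balance
    return frac + up - down

# Example: metadata ghost hits dominate, so the metadata fraction grows.
arc_meta = 1 << 31                       # start at a 50% metadata target
arc_meta = arc_evict_adj(arc_meta, total=1 << 30,
                         up=64 << 20, down=16 << 20, balance=500)
print(arc_meta / 2**32)                  # a bit above 0.5
```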
- */ - if (meta_used > arc_meta_limit || arc_available_memory() < 0) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arcstat_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_evict_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -static uint64_t -arc_evict_meta(uint64_t meta_used) -{ - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_evict_meta_only(meta_used)); - else - return (arc_evict_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. - */ -static arc_buf_contents_t -arc_evict_type(arc_state_t *state) -{ - multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; - - /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). - */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). 
- */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; - } - - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); - - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } - - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); - - return (type); + return (frac + up - down); } /* @@ -4626,150 +4419,128 @@ arc_evict_type(arc_state_t *state) static uint64_t arc_evict(void) { - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); - uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); + uint64_t asize, bytes, total_evicted = 0; + int64_t e, mrud, mrum, mfud, mfum, w; + static uint64_t ogrd, ogrm, ogfd, ogfm; + static uint64_t gsrd, gsrm, gsfd, gsfm; + uint64_t ngrd, ngrm, ngfd, ngfm; - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_evict_meta(ameta); + /* Get current size of ARC states we can evict from. */ + mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); + mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + uint64_t d = mrud + mfud; + uint64_t m = mrum + mfum; + uint64_t t = d + m; - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); + /* Get ARC ghost hits since last eviction. */ + ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t grd = ngrd - ogrd; + ogrd = ngrd; + ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t grm = ngrm - ogrm; + ogrm = ngrm; + ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t gfd = ngfd - ogfd; + ogfd = ngfd; + ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t gfm = ngfm - ogfm; + ogfm = ngfm; - /* - * If we're below arc_meta_min, always prefer to evict data. 
- * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. - */ - if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; + /* Adjust ARC states balance based on ghost hits. */ + arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, + grm + gfm, grd + gfd, zfs_arc_meta_balance); + arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); + arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from metadata. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - } - - /* - * Re-sum ARC stats after the first round of evictions. - */ asize = aggsum_value(&arc_sums.arcstat_size); - ameta = aggsum_value(&arc_sums.arcstat_meta_used); - + int64_t wt = t - (asize - arc_c); /* - * Adjust MFU size - * - * Now that we've tried to evict enough from the MRU to get its - * size back to arc_p, if we're still above the target cache - * size, we evict the rest from the MFU. + * Try to reduce pinned dnodes if more than 3/4 of wanted metadata + * target is not evictable or if they go over arc_dnode_limit. */ - target = asize - arc_c; - - if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + int64_t prune = 0; + int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + w = wt * (arc_meta >> 16) >> 16; + if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > + w * 3 / 4) { + prune = dn / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } else if (dn > arc_dnode_limit) { + prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; } + if (prune > 0) + arc_prune_async(prune); - /* - * Adjust ghost lists - * - * In addition to the above, the ARC also defines target values - * for the ghost lists. The sum of the mru list and mru ghost - * list should never exceed the target size of the cache, and - * the sum of the mru list, mfu list, mru ghost list, and mfu - * ghost list should never exceed twice the target size of the - * cache. 
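A hedged sketch of how the new arc_evict() converts the 32-bit fractions into per-state byte targets. The double shifts in the C code (for example wt * (arc_meta >> 16) >> 16, or the >> 48 for the combined MRU-metadata weight) only exist to keep intermediate products within 64 bits; with Python integers plain division expresses roughly the same targets:

```python
# arc_meta is the target fraction of metadata in the cache; arc_pm / arc_pd
# are the target MRU shares within metadata and data.  wt is the evictable
# data+metadata adjusted for how far the ARC is over its target size.
FRAC = 1 << 32

def evict_targets(wt, arc_meta, arc_pd, arc_pm):
    meta_target = wt * arc_meta // FRAC             # bytes wanted as metadata
    mru_meta_target = meta_target * arc_pm // FRAC  # ... of which MRU metadata
    data_target = wt - meta_target                  # remainder wanted as data
    mru_data_target = data_target * arc_pd // FRAC
    return meta_target, mru_meta_target, data_target, mru_data_target

# Example: 10 GiB evictable, 25% metadata target, both MRU targets at 50%.
print(evict_targets(10 << 30, FRAC // 4, FRAC // 2, FRAC // 2))
```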
The following logic enforces these limits on the ghost - * caches, and evicts from them as needed. - */ - target = zfs_refcount_count(&arc_mru->arcs_size) + - zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; + /* Evict MRU metadata. */ + w = wt * (arc_meta * arc_pm >> 48) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); + total_evicted += bytes; + mrum -= bytes; + asize -= bytes; - bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + /* Evict MFU metadata. */ + w = wt * (arc_meta >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); + total_evicted += bytes; + mfum -= bytes; + asize -= bytes; + + /* Evict MRU data. */ + wt -= m - total_evicted; + w = wt * (arc_pd >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); + total_evicted += bytes; + mrud -= bytes; + asize -= bytes; + + /* Evict MFU data. */ + e = asize - arc_c; + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); + mfud -= bytes; total_evicted += bytes; - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); - /* - * We assume the sum of the mru list and mfu list is less than - * or equal to arc_c (we enforced this above), which means we - * can use the simpler of the two equations below: + * Evict ghost lists * - * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c - * mru ghost + mfu ghost <= arc_c + * Size of each state's ghost list represents how much that state + * may grow by shrinking the other states. Would it need to shrink + * other states to zero (that is unlikely), its ghost size would be + * equal to sum of other three state sizes. But excessive ghost + * size may result in false ghost hits (too far back), that may + * never result in real cache hits if several states are competing. + * So choose some arbitraty point of 1/2 of other state sizes. */ - target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + - zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; + gsrd = (mrum + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - + gsrd; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); - bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; + gsrm = (mrud + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsrm; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); - target -= bytes; + gsfd = (mrud + mrum + mfum) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - + gsfd; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); - total_evicted += - arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + gsfm = (mrud + mrum + mfud) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsfm; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } @@ -4808,7 +4579,10 @@ arc_flush(spa_t *spa, boolean_t retry) void arc_reduce_target_size(int64_t to_free) { - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + uint64_t c = arc_c; + + if (c <= arc_c_min) + return; /* * All callers want the ARC to actually evict (at least) this much @@ -4818,26 +4592,16 @@ arc_reduce_target_size(int64_t to_free) * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. 
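A small Python sketch of the reworked arc_reduce_target_size() arithmetic that follows: any slack between the current target and the actual ARC size is added to the request, so the caller's bytes really do get evicted rather than merely absorbed by the slack:

```python
# Compute the new arc_c given the current target (c), floor (c_min), actual
# ARC size (asize), and the amount the caller wants evicted (to_free).
def reduce_target(c, c_min, asize, to_free):
    if c <= c_min:
        return c
    if asize < c:
        to_free += c - asize          # charge existing slack to the request
    return max(c - to_free, c_min)

# 8 GiB target, 7 GiB in use, asked to free 512 MiB -> new target 6.5 GiB.
print(reduce_target(8 << 30, 1 << 30, 7 << 30, 512 << 20) / 2**30)
```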
*/ - uint64_t c = MIN(arc_c, asize); + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + if (asize < c) + to_free += c - asize; + arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); - if (c > to_free && c - to_free > arc_c_min) { - arc_c = c - to_free; - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } else { - arc_c = arc_c_min; - } - - if (asize > arc_c) { - /* See comment in arc_evict_cb_check() on why lock+flag */ - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - mutex_exit(&arc_evict_lock); - zthr_wakeup(arc_evict_zthr); - } + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); } /* @@ -4859,14 +4623,6 @@ arc_kmem_reap_soon(void) kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL - if ((aggsum_compare(&arc_sums.arcstat_meta_used, - arc_meta_limit) >= 0) && zfs_arc_meta_prune) { - /* - * We are exceeding our meta-data cache limit. - * Prune some entries to release holds on meta-data. - */ - arc_prune_async(zfs_arc_meta_prune); - } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. @@ -5143,40 +4899,8 @@ arc_reap_cb(void *arg, zthr_t *zthr) * when we are adding new content to the cache. */ static void -arc_adapt(int bytes, arc_state_t *state) +arc_adapt(uint64_t bytes) { - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + (uint64_t)bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 
1 : (mrug_size / mfug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - /* * Wake reap thread if we do not have any available memory */ @@ -5195,18 +4919,12 @@ arc_adapt(int bytes, arc_state_t *state) * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ - ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); - if (aggsum_upper_bound(&arc_sums.arcstat_size) >= - arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) + if (aggsum_upper_bound(&arc_sums.arcstat_size) + + 2 * SPA_MAXBLOCKSIZE >= arc_c) { + uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); + if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; - else if (state == arc_anon && arc_p < arc_c >> 1) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; } - ASSERT((int64_t)arc_p >= 0); } /* @@ -5255,7 +4973,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); + arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5353,11 +5071,7 @@ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (alloc_flags & ARC_HDR_DO_ADAPT) - arc_adapt(size, state); + arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data @@ -5375,7 +5089,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); - VERIFY3U(hdr->b_type, ==, type); + arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { @@ -5386,9 +5100,11 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. 
*/ + arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); + (void) zfs_refcount_add_many(&state->arcs_size[type], size, + tag); /* * If this is reached via arc_read, the link is @@ -5404,17 +5120,6 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p && - arc_p < arc_c >> 1)) - arc_p = MIN(arc_c, arc_p + size); } } @@ -5457,7 +5162,7 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -5570,6 +5275,8 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); @@ -5597,6 +5304,8 @@ arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { @@ -6156,6 +5865,7 @@ top: uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) @@ -6170,7 +5880,6 @@ top: * embedded data. */ arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); @@ -6229,11 +5938,6 @@ top: alloc_flags |= ARC_HDR_ALLOC_LINEAR; } - /* - * Call arc_adapt() explicitly before arc_access() to allow - * its logic to balance MRU/MFU based on the original state. - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); /* * Take additional reference for IO_IN_PROGRESS. 
It stops * arc_access() from putting this header without any buffers @@ -6706,7 +6410,7 @@ arc_release(arc_buf_t *buf, const void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -6728,7 +6432,7 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many(&state->arcs_size, + (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { @@ -6766,7 +6470,7 @@ arc_release(arc_buf_t *buf, const void *tag) (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; - (void) zfs_refcount_add_many(&arc_anon->arcs_size, + (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); @@ -6922,7 +6626,7 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || @@ -6935,19 +6639,17 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -7202,7 +6904,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - + anon_size = MAX((int64_t) + (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* @@ -7258,9 +6962,14 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { - size->value.ui64 = zfs_refcount_count(&state->arcs_size); + data->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); + metadata->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = @@ -7360,37 +7069,49 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - 
aggsum_value(&arc_sums.arcstat_dnode_size) + + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, + &as->arcstat_anon_data, + &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, + &as->arcstat_mru_data, + &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_data, + &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, + &as->arcstat_mfu_data, + &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_data, + &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, + &as->arcstat_uncached_data, + &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - aggsum_value(&arc_sums.arcstat_dnode_size); + wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7488,7 +7209,7 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = - aggsum_value(&arc_sums.arcstat_meta_used); + wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = @@ -7574,7 +7295,6 @@ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); - unsigned long limit; /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && @@ -7591,44 +7311,15 @@ arc_tuning_update(boolean_t verbose) (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); - arc_p = (arc_c >> 1); - if (arc_meta_limit > arc_c_max) - arc_meta_limit = arc_c_max; - if (arc_dnode_size_limit > arc_meta_limit) - arc_dnode_size_limit = arc_meta_limit; + if (arc_dnode_limit > arc_c_max) + arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); - /* Valid range: 16M - */ - if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && - (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && - (zfs_arc_meta_min <= arc_c_max)) { - arc_meta_min = zfs_arc_meta_min; - if (arc_meta_limit < arc_meta_min) - arc_meta_limit = arc_meta_min; - if (arc_dnode_size_limit < arc_meta_min) - arc_dnode_size_limit = arc_meta_min; - } - WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); - - /* Valid range: - */ - limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : - MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; - if ((limit != arc_meta_limit) && - (limit >= arc_meta_min) && - (limit <= arc_c_max)) - arc_meta_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); - - /* Valid range: - */ - limit = zfs_arc_dnode_limit ? 
zfs_arc_dnode_limit : - MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; - if ((limit != arc_dnode_size_limit) && - (limit >= arc_meta_min) && - (limit <= arc_meta_limit)) - arc_dnode_size_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, - verbose); + /* Valid range: 0 - */ + arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : + MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; + WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) @@ -7640,10 +7331,6 @@ arc_tuning_update(boolean_t verbose) arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } - /* Valid range: 1 - N */ - if (zfs_arc_p_min_shift) - arc_p_min_shift = zfs_arc_p_min_shift; - /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; @@ -7732,13 +7419,25 @@ arc_state_init(void) zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); - zfs_refcount_create(&arc_uncached->arcs_size); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); + + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); @@ -7781,7 +7480,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - aggsum_init(&arc_sums.arcstat_dnode_size, 0); + wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7826,7 +7525,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); - aggsum_init(&arc_sums.arcstat_meta_used, 0); + wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); @@ -7865,13 +7564,20 @@ arc_state_fini(void) 
zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - zfs_refcount_destroy(&arc_l2c_only->arcs_size); - zfs_refcount_destroy(&arc_uncached->arcs_size); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); @@ -7886,6 +7592,11 @@ arc_state_fini(void) multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); @@ -7927,7 +7638,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - aggsum_fini(&arc_sums.arcstat_dnode_size); + wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); @@ -7972,7 +7683,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); - aggsum_fini(&arc_sums.arcstat_meta_used); + wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); @@ -8044,18 +7755,16 @@ arc_init(void) #endif arc_c = arc_c_min; - arc_p = (arc_c >> 1); - - /* Set min to 1/2 of arc_c_min */ - arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* - * Set arc_meta_limit to a percent of arc_c_max with a floor of - * arc_meta_min, and a ceiling of arc_c_max. + * 32-bit fixed point fractions of metadata from total ARC size, + * MRU data from all data and MRU metadata from all metadata. */ - percent = MIN(zfs_arc_meta_limit_percent, 100); - arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); + arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ + arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. 
*/ + arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ + percent = MIN(zfs_arc_dnode_limit_percent, 100); - arc_dnode_size_limit = (percent * arc_meta_limit) / 100; + arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); @@ -8832,7 +8541,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8869,7 +8578,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -9763,7 +9472,7 @@ l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); - return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } @@ -10653,7 +10362,7 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ - arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* @@ -11113,40 +10822,18 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_u64, - spl_param_get_u64, ZMOD_RW, "Metadata limit for ARC size in bytes"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, - param_set_arc_int, param_get_uint, ZMOD_RW, - "Percent of ARC size for ARC meta limit"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_u64, - spl_param_get_u64, ZMOD_RW, "Minimum ARC metadata size in bytes"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, - "Meta objects to scan for prune"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, UINT, ZMOD_RW, - "Limit number of restarts in arc_evict_meta"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, UINT, ZMOD_RW, - "Meta reclaim strategy"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, + "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, - "Disable arc_p adapt dampener"); - ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, - param_get_uint, ZMOD_RW, "arc_c shift to calc min/max arc_p"); - ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); diff --git a/tests/zfs-tests/tests/perf/perf.shlib b/tests/zfs-tests/tests/perf/perf.shlib index 27c40bd52..5555e910d 
100644 --- a/tests/zfs-tests/tests/perf/perf.shlib +++ b/tests/zfs-tests/tests/perf/perf.shlib @@ -485,7 +485,6 @@ function get_system_config printf " \"tunables\": {\n" >>$config for tunable in \ zfs_arc_max \ - zfs_arc_meta_limit \ zfs_arc_sys_free \ zfs_dirty_data_max \ zfs_flags \
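
Reviewer note, not part of the patch: the sketch below shows how the new 32-bit fixed-point fractions set up in arc_init() (arc_meta, arc_pd, arc_pm) turn into per-bucket byte targets with the same shift arithmetic the reworked arc_evict() uses. The standalone main(), the value of wt, and the sample fractions are invented for illustration only.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t s = 1ULL << 32;	/* fixed-point scale (32-bit fraction) */
	uint64_t arc_meta = s / 4;	/* metadata is 25% of the ARC */
	uint64_t arc_pd = s / 2;	/* MRU share of data is 50% */
	uint64_t arc_pm = s / 2;	/* MRU share of metadata is 50% */
	uint64_t wt = 1ULL << 30;	/* pretend 1 GiB is being balanced */

	/*
	 * arc_meta * arc_pm is a 64-bit fraction; shifting right by 48
	 * leaves a 16-bit fraction, and multiplying by wt followed by a
	 * final shift of 16 yields bytes, exactly as in arc_evict().
	 */
	uint64_t mru_meta = wt * (arc_meta * arc_pm >> 48) >> 16;
	uint64_t all_meta = wt * (arc_meta >> 16) >> 16;
	/*
	 * arc_evict() first subtracts the remaining metadata from the
	 * weight before splitting the data side; this sketch skips that
	 * step and applies arc_pd to the full weight for brevity.
	 */
	uint64_t mru_data = wt * (arc_pd >> 16) >> 16;

	printf("MRU metadata target: %" PRIu64 " bytes\n", mru_meta);
	printf("metadata target:     %" PRIu64 " bytes\n", all_meta);
	printf("MRU data target:     %" PRIu64 " bytes\n", mru_data);
	return (0);
}

With the sample values this prints 128 MiB, 256 MiB, and 512 MiB, i.e. a quarter of the weight goes to metadata, half of that quarter to MRU metadata, and half of the weight to MRU data.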
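
The ghost-list trimming added to arc_evict() caps each ghost list at half of the combined size of the other three resident buckets, as described in the new comment above. A minimal sketch of that rule follows; the helper name and the sample sizes are invented here.

#include <stdint.h>
#include <stdio.h>

/* Illustration only: cap for one ghost list, per the patched arc_evict(). */
static uint64_t
ghost_cap(uint64_t other1, uint64_t other2, uint64_t other3)
{
	return ((other1 + other2 + other3) / 2);
}

int
main(void)
{
	/* Made-up resident sizes for MRU/MFU data and metadata, in bytes. */
	uint64_t mrud = 512ULL << 20, mrum = 128ULL << 20;
	uint64_t mfud = 256ULL << 20, mfum = 128ULL << 20;

	/* MRU ghost data is capped by the other three buckets. */
	printf("MRU ghost data cap: %llu\n",
	    (unsigned long long)ghost_cap(mrum, mfud, mfum));
	/* MFU ghost metadata likewise. */
	printf("MFU ghost metadata cap: %llu\n",
	    (unsigned long long)ghost_cap(mrud, mrum, mfud));
	return (0);
}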
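
Finally, a small sketch of the reworked arc_reduce_target_size() flow: when the cache is already below its target, the slack is folded into to_free so the new target still forces the requested amount of eviction, with arc_c_min as the floor. The helper name and sample numbers are invented; error handling and locking are omitted.

#include <stdint.h>
#include <stdio.h>

/* Illustration only: compute the new target the way the patch does. */
static uint64_t
reduced_target(uint64_t c, uint64_t c_min, uint64_t asize, int64_t to_free)
{
	if (c <= c_min)
		return (c);		/* already at the floor, nothing to do */
	if (asize < c)
		to_free += c - asize;	/* fold in the unused headroom */
	int64_t nc = (int64_t)c - to_free;
	return (nc > (int64_t)c_min ? (uint64_t)nc : c_min);
}

int
main(void)
{
	/* 4 GiB target, 1 GiB floor, 3 GiB resident, asked to free 512 MiB. */
	uint64_t nc = reduced_target(4ULL << 30, 1ULL << 30, 3ULL << 30,
	    512LL << 20);
	printf("new arc_c: %llu\n", (unsigned long long)nc);
	return (0);
}

Here the result is 2.5 GiB: 512 MiB below the 3 GiB resident size, so the eviction thread woken at the end of arc_reduce_target_size() still evicts the requested amount.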