mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 19:28:53 +03:00
OpenZFS 8484 - Implement aggregate sum and use for arc counters
In pursuit of improving performance on multi-core systems, we should implements fanned out counters and use them to improve the performance of some of the arc statistics. These stats are updated extremely frequently, and can consume a significant amount of CPU time. Authored by: Paul Dagnelie <pcd@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Dan McDonald <danmcd@joyent.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Ported-by: Paul Dagnelie <pcd@delphix.com> OpenZFS-issue: https://www.illumos.org/issues/8484 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7028a8b92b7 Issue #3752 Closes #7462
This commit is contained in:
committed by
Brian Behlendorf
parent
f0ed6c7448
commit
37fb3e4318
+159
-71
@@ -303,6 +303,8 @@
|
||||
#include <zfs_fletcher.h>
|
||||
#include <sys/arc_impl.h>
|
||||
#include <sys/trace_arc.h>
|
||||
#include <sys/aggsum.h>
|
||||
#include <sys/cityhash.h>
|
||||
|
||||
#ifndef _KERNEL
|
||||
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
|
||||
@@ -475,6 +477,7 @@ typedef struct arc_stats {
|
||||
kstat_named_t arcstat_c;
|
||||
kstat_named_t arcstat_c_min;
|
||||
kstat_named_t arcstat_c_max;
|
||||
/* Not updated directly; only synced in arc_kstat_update. */
|
||||
kstat_named_t arcstat_size;
|
||||
/*
|
||||
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
|
||||
@@ -503,12 +506,14 @@ typedef struct arc_stats {
|
||||
* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
|
||||
* caches), and arc_buf_t structures (allocated via arc_buf_t
|
||||
* cache).
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_hdr_size;
|
||||
/*
|
||||
* Number of bytes consumed by ARC buffers of type equal to
|
||||
* ARC_BUFC_DATA. This is generally consumed by buffers backing
|
||||
* on disk user data (e.g. plain file contents).
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_data_size;
|
||||
/*
|
||||
@@ -516,18 +521,22 @@ typedef struct arc_stats {
|
||||
* ARC_BUFC_METADATA. This is generally consumed by buffers
|
||||
* backing on disk data that is used for internal ZFS
|
||||
* structures (e.g. ZAP, dnode, indirect blocks, etc).
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_metadata_size;
|
||||
/*
|
||||
* Number of bytes consumed by dmu_buf_impl_t objects.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_dbuf_size;
|
||||
/*
|
||||
* Number of bytes consumed by dnode_t objects.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_dnode_size;
|
||||
/*
|
||||
* Number of bytes consumed by bonus buffers.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_bonus_size;
|
||||
/*
|
||||
@@ -535,6 +544,7 @@ typedef struct arc_stats {
|
||||
* arc_anon state. This includes *all* buffers in the arc_anon
|
||||
* state; e.g. data, metadata, evictable, and unevictable buffers
|
||||
* are all included in this value.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_anon_size;
|
||||
/*
|
||||
@@ -542,6 +552,7 @@ typedef struct arc_stats {
|
||||
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
||||
* residing in the arc_anon state, and are eligible for eviction
|
||||
* (e.g. have no outstanding holds on the buffer).
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_anon_evictable_data;
|
||||
/*
|
||||
@@ -549,6 +560,7 @@ typedef struct arc_stats {
|
||||
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
||||
* residing in the arc_anon state, and are eligible for eviction
|
||||
* (e.g. have no outstanding holds on the buffer).
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_anon_evictable_metadata;
|
||||
/*
|
||||
@@ -556,6 +568,7 @@ typedef struct arc_stats {
|
||||
* arc_mru state. This includes *all* buffers in the arc_mru
|
||||
* state; e.g. data, metadata, evictable, and unevictable buffers
|
||||
* are all included in this value.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mru_size;
|
||||
/*
|
||||
@@ -563,6 +576,7 @@ typedef struct arc_stats {
|
||||
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
||||
* residing in the arc_mru state, and are eligible for eviction
|
||||
* (e.g. have no outstanding holds on the buffer).
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mru_evictable_data;
|
||||
/*
|
||||
@@ -570,6 +584,7 @@ typedef struct arc_stats {
|
||||
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
||||
* residing in the arc_mru state, and are eligible for eviction
|
||||
* (e.g. have no outstanding holds on the buffer).
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mru_evictable_metadata;
|
||||
/*
|
||||
@@ -580,18 +595,21 @@ typedef struct arc_stats {
|
||||
* don't actually have ARC buffers linked off of these headers.
|
||||
* Thus, *if* the headers had associated ARC buffers, these
|
||||
* buffers *would have* consumed this number of bytes.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mru_ghost_size;
|
||||
/*
|
||||
* Number of bytes that *would have been* consumed by ARC
|
||||
* buffers that are eligible for eviction, of type
|
||||
* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mru_ghost_evictable_data;
|
||||
/*
|
||||
* Number of bytes that *would have been* consumed by ARC
|
||||
* buffers that are eligible for eviction, of type
|
||||
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mru_ghost_evictable_metadata;
|
||||
/*
|
||||
@@ -599,36 +617,42 @@ typedef struct arc_stats {
|
||||
* arc_mfu state. This includes *all* buffers in the arc_mfu
|
||||
* state; e.g. data, metadata, evictable, and unevictable buffers
|
||||
* are all included in this value.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mfu_size;
|
||||
/*
|
||||
* Number of bytes consumed by ARC buffers that are eligible for
|
||||
* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
|
||||
* state.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mfu_evictable_data;
|
||||
/*
|
||||
* Number of bytes consumed by ARC buffers that are eligible for
|
||||
* eviction, of type ARC_BUFC_METADATA, and reside in the
|
||||
* arc_mfu state.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mfu_evictable_metadata;
|
||||
/*
|
||||
* Total number of bytes that *would have been* consumed by ARC
|
||||
* buffers in the arc_mfu_ghost state. See the comment above
|
||||
* arcstat_mru_ghost_size for more details.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mfu_ghost_size;
|
||||
/*
|
||||
* Number of bytes that *would have been* consumed by ARC
|
||||
* buffers that are eligible for eviction, of type
|
||||
* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mfu_ghost_evictable_data;
|
||||
/*
|
||||
* Number of bytes that *would have been* consumed by ARC
|
||||
* buffers that are eligible for eviction, of type
|
||||
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
|
||||
* Not updated directly; only synced in arc_kstat_update.
|
||||
*/
|
||||
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
|
||||
kstat_named_t arcstat_l2_hits;
|
||||
@@ -650,6 +674,7 @@ typedef struct arc_stats {
|
||||
kstat_named_t arcstat_l2_io_error;
|
||||
kstat_named_t arcstat_l2_lsize;
|
||||
kstat_named_t arcstat_l2_psize;
|
||||
/* Not updated directly; only synced in arc_kstat_update. */
|
||||
kstat_named_t arcstat_l2_hdr_size;
|
||||
kstat_named_t arcstat_memory_throttle_count;
|
||||
kstat_named_t arcstat_memory_direct_count;
|
||||
@@ -661,6 +686,7 @@ typedef struct arc_stats {
|
||||
kstat_named_t arcstat_tempreserve;
|
||||
kstat_named_t arcstat_loaned_bytes;
|
||||
kstat_named_t arcstat_prune;
|
||||
/* Not updated directly; only synced in arc_kstat_update. */
|
||||
kstat_named_t arcstat_meta_used;
|
||||
kstat_named_t arcstat_meta_limit;
|
||||
kstat_named_t arcstat_dnode_limit;
|
||||
@@ -829,7 +855,6 @@ static arc_state_t *arc_l2c_only;
|
||||
* the possibility of inconsistency by having shadow copies of the variables,
|
||||
* while still allowing the code to be readable.
|
||||
*/
|
||||
#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
|
||||
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
|
||||
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
|
||||
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
|
||||
@@ -840,11 +865,7 @@ static arc_state_t *arc_l2c_only;
|
||||
#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
|
||||
#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
|
||||
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
|
||||
#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
|
||||
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
|
||||
#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
|
||||
#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */
|
||||
#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
|
||||
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
|
||||
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
|
||||
|
||||
@@ -857,6 +878,24 @@ static arc_state_t *arc_l2c_only;
|
||||
/* number of bytes in the arc from arc_buf_t's */
|
||||
#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
|
||||
|
||||
/*
|
||||
* There are also some ARC variables that we want to export, but that are
|
||||
* updated so often that having the canonical representation be the statistic
|
||||
* variable causes a performance bottleneck. We want to use aggsum_t's for these
|
||||
* instead, but still be able to export the kstat in the same way as before.
|
||||
* The solution is to always use the aggsum version, except in the kstat update
|
||||
* callback.
|
||||
*/
|
||||
aggsum_t arc_size;
|
||||
aggsum_t arc_meta_used;
|
||||
aggsum_t astat_data_size;
|
||||
aggsum_t astat_metadata_size;
|
||||
aggsum_t astat_dbuf_size;
|
||||
aggsum_t astat_dnode_size;
|
||||
aggsum_t astat_bonus_size;
|
||||
aggsum_t astat_hdr_size;
|
||||
aggsum_t astat_l2_hdr_size;
|
||||
|
||||
static list_t arc_prune_list;
|
||||
static kmutex_t arc_prune_mtx;
|
||||
static taskq_t *arc_prune_taskq;
|
||||
@@ -1050,21 +1089,15 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
|
||||
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
|
||||
static void l2arc_read_done(zio_t *);
|
||||
|
||||
|
||||
/*
|
||||
* We use Cityhash for this. It's fast, and has good hash properties without
|
||||
* requiring any large static buffers.
|
||||
*/
|
||||
static uint64_t
|
||||
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
|
||||
{
|
||||
uint8_t *vdva = (uint8_t *)dva;
|
||||
uint64_t crc = -1ULL;
|
||||
int i;
|
||||
|
||||
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
||||
|
||||
for (i = 0; i < sizeof (dva_t); i++)
|
||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
|
||||
|
||||
crc ^= (spa>>8) ^ birth;
|
||||
|
||||
return (crc);
|
||||
return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
|
||||
}
|
||||
|
||||
#define HDR_EMPTY(hdr) \
|
||||
@@ -2676,32 +2709,32 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
|
||||
default:
|
||||
break;
|
||||
case ARC_SPACE_DATA:
|
||||
ARCSTAT_INCR(arcstat_data_size, space);
|
||||
aggsum_add(&astat_data_size, space);
|
||||
break;
|
||||
case ARC_SPACE_META:
|
||||
ARCSTAT_INCR(arcstat_metadata_size, space);
|
||||
aggsum_add(&astat_metadata_size, space);
|
||||
break;
|
||||
case ARC_SPACE_BONUS:
|
||||
ARCSTAT_INCR(arcstat_bonus_size, space);
|
||||
aggsum_add(&astat_bonus_size, space);
|
||||
break;
|
||||
case ARC_SPACE_DNODE:
|
||||
ARCSTAT_INCR(arcstat_dnode_size, space);
|
||||
aggsum_add(&astat_dnode_size, space);
|
||||
break;
|
||||
case ARC_SPACE_DBUF:
|
||||
ARCSTAT_INCR(arcstat_dbuf_size, space);
|
||||
aggsum_add(&astat_dbuf_size, space);
|
||||
break;
|
||||
case ARC_SPACE_HDRS:
|
||||
ARCSTAT_INCR(arcstat_hdr_size, space);
|
||||
aggsum_add(&astat_hdr_size, space);
|
||||
break;
|
||||
case ARC_SPACE_L2HDRS:
|
||||
ARCSTAT_INCR(arcstat_l2_hdr_size, space);
|
||||
aggsum_add(&astat_l2_hdr_size, space);
|
||||
break;
|
||||
}
|
||||
|
||||
if (type != ARC_SPACE_DATA)
|
||||
ARCSTAT_INCR(arcstat_meta_used, space);
|
||||
aggsum_add(&arc_meta_used, space);
|
||||
|
||||
atomic_add_64(&arc_size, space);
|
||||
aggsum_add(&arc_size, space);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -2713,37 +2746,42 @@ arc_space_return(uint64_t space, arc_space_type_t type)
|
||||
default:
|
||||
break;
|
||||
case ARC_SPACE_DATA:
|
||||
ARCSTAT_INCR(arcstat_data_size, -space);
|
||||
aggsum_add(&astat_data_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_META:
|
||||
ARCSTAT_INCR(arcstat_metadata_size, -space);
|
||||
aggsum_add(&astat_metadata_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_BONUS:
|
||||
ARCSTAT_INCR(arcstat_bonus_size, -space);
|
||||
aggsum_add(&astat_bonus_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_DNODE:
|
||||
ARCSTAT_INCR(arcstat_dnode_size, -space);
|
||||
aggsum_add(&astat_dnode_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_DBUF:
|
||||
ARCSTAT_INCR(arcstat_dbuf_size, -space);
|
||||
aggsum_add(&astat_dbuf_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_HDRS:
|
||||
ARCSTAT_INCR(arcstat_hdr_size, -space);
|
||||
aggsum_add(&astat_hdr_size, -space);
|
||||
break;
|
||||
case ARC_SPACE_L2HDRS:
|
||||
ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
|
||||
aggsum_add(&astat_l2_hdr_size, -space);
|
||||
break;
|
||||
}
|
||||
|
||||
if (type != ARC_SPACE_DATA) {
|
||||
ASSERT(arc_meta_used >= space);
|
||||
if (arc_meta_max < arc_meta_used)
|
||||
arc_meta_max = arc_meta_used;
|
||||
ARCSTAT_INCR(arcstat_meta_used, -space);
|
||||
ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
|
||||
/*
|
||||
* We use the upper bound here rather than the precise value
|
||||
* because the arc_meta_max value doesn't need to be
|
||||
* precise. It's only consumed by humans via arcstats.
|
||||
*/
|
||||
if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
|
||||
arc_meta_max = aggsum_upper_bound(&arc_meta_used);
|
||||
aggsum_add(&arc_meta_used, -space);
|
||||
}
|
||||
|
||||
ASSERT(arc_size >= space);
|
||||
atomic_add_64(&arc_size, -space);
|
||||
ASSERT(aggsum_compare(&arc_size, space) >= 0);
|
||||
aggsum_add(&arc_size, -space);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -4073,9 +4111,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
|
||||
* Request that 10% of the LRUs be scanned by the superblock
|
||||
* shrinker.
|
||||
*/
|
||||
if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit)
|
||||
arc_prune_async((arc_dnode_size - arc_dnode_limit) /
|
||||
sizeof (dnode_t) / zfs_arc_dnode_reduce_percent);
|
||||
if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
|
||||
arc_dnode_limit) > 0) {
|
||||
arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
|
||||
arc_dnode_limit) / sizeof (dnode_t) /
|
||||
zfs_arc_dnode_reduce_percent);
|
||||
}
|
||||
|
||||
/*
|
||||
* Start eviction using a randomly selected sublist,
|
||||
@@ -4257,14 +4298,14 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
|
||||
*
|
||||
* Therefore, this function has been updated to make alternating passes
|
||||
* over the ARC releasing data buffers and then newly unheld meta data
|
||||
* buffers. This ensures forward progress is maintained and arc_meta_used
|
||||
* buffers. This ensures forward progress is maintained and meta_used
|
||||
* will decrease. Normally this is sufficient, but if required the ARC
|
||||
* will call the registered prune callbacks causing dentry and inodes to
|
||||
* be dropped from the VFS cache. This will make dnode meta data buffers
|
||||
* available for reclaim.
|
||||
*/
|
||||
static uint64_t
|
||||
arc_adjust_meta_balanced(void)
|
||||
arc_adjust_meta_balanced(uint64_t meta_used)
|
||||
{
|
||||
int64_t delta, prune = 0, adjustmnt;
|
||||
uint64_t total_evicted = 0;
|
||||
@@ -4280,7 +4321,7 @@ restart:
|
||||
* metadata from the MFU. I think we probably need to implement a
|
||||
* "metadata arc_p" value to do this properly.
|
||||
*/
|
||||
adjustmnt = arc_meta_used - arc_meta_limit;
|
||||
adjustmnt = meta_used - arc_meta_limit;
|
||||
|
||||
if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
|
||||
delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
|
||||
@@ -4305,7 +4346,7 @@ restart:
|
||||
total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
|
||||
}
|
||||
|
||||
adjustmnt = arc_meta_used - arc_meta_limit;
|
||||
adjustmnt = meta_used - arc_meta_limit;
|
||||
|
||||
if (adjustmnt > 0 &&
|
||||
refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
|
||||
@@ -4329,7 +4370,7 @@ restart:
|
||||
* meta buffers. Requests to the upper layers will be made with
|
||||
* increasingly large scan sizes until the ARC is below the limit.
|
||||
*/
|
||||
if (arc_meta_used > arc_meta_limit) {
|
||||
if (meta_used > arc_meta_limit) {
|
||||
if (type == ARC_BUFC_DATA) {
|
||||
type = ARC_BUFC_METADATA;
|
||||
} else {
|
||||
@@ -4354,7 +4395,7 @@ restart:
|
||||
* capped by the arc_meta_limit tunable.
|
||||
*/
|
||||
static uint64_t
|
||||
arc_adjust_meta_only(void)
|
||||
arc_adjust_meta_only(uint64_t meta_used)
|
||||
{
|
||||
uint64_t total_evicted = 0;
|
||||
int64_t target;
|
||||
@@ -4366,7 +4407,7 @@ arc_adjust_meta_only(void)
|
||||
* we're over the meta limit more than we're over arc_p, we
|
||||
* evict some from the MRU here, and some from the MFU below.
|
||||
*/
|
||||
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
|
||||
target = MIN((int64_t)(meta_used - arc_meta_limit),
|
||||
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
||||
refcount_count(&arc_mru->arcs_size) - arc_p));
|
||||
|
||||
@@ -4377,8 +4418,9 @@ arc_adjust_meta_only(void)
|
||||
* below the meta limit, but not so much as to drop us below the
|
||||
* space allotted to the MFU (which is defined as arc_c - arc_p).
|
||||
*/
|
||||
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
|
||||
(int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
|
||||
target = MIN((int64_t)(meta_used - arc_meta_limit),
|
||||
(int64_t)(refcount_count(&arc_mfu->arcs_size) -
|
||||
(arc_c - arc_p)));
|
||||
|
||||
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
||||
|
||||
@@ -4386,12 +4428,12 @@ arc_adjust_meta_only(void)
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
arc_adjust_meta(void)
|
||||
arc_adjust_meta(uint64_t meta_used)
|
||||
{
|
||||
if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
|
||||
return (arc_adjust_meta_only());
|
||||
return (arc_adjust_meta_only(meta_used));
|
||||
else
|
||||
return (arc_adjust_meta_balanced());
|
||||
return (arc_adjust_meta_balanced(meta_used));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -4478,12 +4520,14 @@ arc_adjust(void)
|
||||
uint64_t total_evicted = 0;
|
||||
uint64_t bytes;
|
||||
int64_t target;
|
||||
uint64_t asize = aggsum_value(&arc_size);
|
||||
uint64_t ameta = aggsum_value(&arc_meta_used);
|
||||
|
||||
/*
|
||||
* If we're over arc_meta_limit, we want to correct that before
|
||||
* potentially evicting data buffers below.
|
||||
*/
|
||||
total_evicted += arc_adjust_meta();
|
||||
total_evicted += arc_adjust_meta(ameta);
|
||||
|
||||
/*
|
||||
* Adjust MRU size
|
||||
@@ -4495,9 +4539,9 @@ arc_adjust(void)
|
||||
* the MRU is over arc_p, we'll evict enough to get back to
|
||||
* arc_p here, and then evict more from the MFU below.
|
||||
*/
|
||||
target = MIN((int64_t)(arc_size - arc_c),
|
||||
target = MIN((int64_t)(asize - arc_c),
|
||||
(int64_t)(refcount_count(&arc_anon->arcs_size) +
|
||||
refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
|
||||
refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
|
||||
|
||||
/*
|
||||
* If we're below arc_meta_min, always prefer to evict data.
|
||||
@@ -4508,7 +4552,7 @@ arc_adjust(void)
|
||||
* type, spill over into the next type.
|
||||
*/
|
||||
if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
|
||||
arc_meta_used > arc_meta_min) {
|
||||
ameta > arc_meta_min) {
|
||||
bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
|
||||
total_evicted += bytes;
|
||||
|
||||
@@ -4541,10 +4585,10 @@ arc_adjust(void)
|
||||
* size back to arc_p, if we're still above the target cache
|
||||
* size, we evict the rest from the MFU.
|
||||
*/
|
||||
target = arc_size - arc_c;
|
||||
target = asize - arc_c;
|
||||
|
||||
if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
|
||||
arc_meta_used > arc_meta_min) {
|
||||
ameta > arc_meta_min) {
|
||||
bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
|
||||
total_evicted += bytes;
|
||||
|
||||
@@ -4645,13 +4689,14 @@ arc_flush(spa_t *spa, boolean_t retry)
|
||||
void
|
||||
arc_shrink(int64_t to_free)
|
||||
{
|
||||
uint64_t asize = aggsum_value(&arc_size);
|
||||
uint64_t c = arc_c;
|
||||
|
||||
if (c > to_free && c - to_free > arc_c_min) {
|
||||
arc_c = c - to_free;
|
||||
atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
|
||||
if (arc_c > arc_size)
|
||||
arc_c = MAX(arc_size, arc_c_min);
|
||||
if (asize < arc_c)
|
||||
arc_c = MAX(asize, arc_c_min);
|
||||
if (arc_p > arc_c)
|
||||
arc_p = (arc_c >> 1);
|
||||
ASSERT(arc_c >= arc_c_min);
|
||||
@@ -4660,7 +4705,7 @@ arc_shrink(int64_t to_free)
|
||||
arc_c = arc_c_min;
|
||||
}
|
||||
|
||||
if (arc_size > arc_c)
|
||||
if (asize > arc_c)
|
||||
(void) arc_adjust();
|
||||
}
|
||||
|
||||
@@ -4877,7 +4922,8 @@ arc_kmem_reap_now(void)
|
||||
extern kmem_cache_t *range_seg_cache;
|
||||
|
||||
#ifdef _KERNEL
|
||||
if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
|
||||
if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
|
||||
zfs_arc_meta_prune) {
|
||||
/*
|
||||
* We are exceeding our meta-data cache limit.
|
||||
* Prune some entries to release holds on meta-data.
|
||||
@@ -5022,7 +5068,7 @@ arc_reclaim_thread(void *unused)
|
||||
* be helpful and could potentially cause us to enter an
|
||||
* infinite loop.
|
||||
*/
|
||||
if (arc_size <= arc_c || evicted == 0) {
|
||||
if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
|
||||
/*
|
||||
* We're either no longer overflowing, or we
|
||||
* can't evict anything more, so we should wake
|
||||
@@ -5101,12 +5147,13 @@ arc_reclaim_thread(void *unused)
|
||||
static uint64_t
|
||||
arc_evictable_memory(void)
|
||||
{
|
||||
int64_t asize = aggsum_value(&arc_size);
|
||||
uint64_t arc_clean =
|
||||
refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
|
||||
refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
|
||||
refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
|
||||
refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
|
||||
uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
|
||||
uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
|
||||
|
||||
/*
|
||||
* Scale reported evictable memory in proportion to page cache, cap
|
||||
@@ -5118,7 +5165,7 @@ arc_evictable_memory(void)
|
||||
if (arc_dirty >= min)
|
||||
return (arc_clean);
|
||||
|
||||
return (MAX((int64_t)arc_size - (int64_t)min, 0));
|
||||
return (MAX((int64_t)asize - (int64_t)min, 0));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -5261,7 +5308,8 @@ arc_adapt(int bytes, arc_state_t *state)
|
||||
* cache size, increment the target cache size
|
||||
*/
|
||||
ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
|
||||
if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
|
||||
if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >=
|
||||
0) {
|
||||
atomic_add_64(&arc_c, (int64_t)bytes);
|
||||
if (arc_c > arc_c_max)
|
||||
arc_c = arc_c_max;
|
||||
@@ -5284,7 +5332,16 @@ arc_is_overflowing(void)
|
||||
uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
|
||||
arc_c >> zfs_arc_overflow_shift);
|
||||
|
||||
return (arc_size >= arc_c + overflow);
|
||||
/*
|
||||
* We just compare the lower bound here for performance reasons. Our
|
||||
* primary goals are to make sure that the arc never grows without
|
||||
* bound, and that it can reach its maximum size. This check
|
||||
* accomplishes both goals. The maximum amount we could run over by is
|
||||
* 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
|
||||
* in the ARC. In practice, that's in the tens of MB, which is low
|
||||
* enough to be safe.
|
||||
*/
|
||||
return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
|
||||
}
|
||||
|
||||
static abd_t *
|
||||
@@ -5399,7 +5456,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
|
||||
* If we are growing the cache, and we are adding anonymous
|
||||
* data, and we have outgrown arc_p, update arc_p
|
||||
*/
|
||||
if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
|
||||
if (aggsum_compare(&arc_size, arc_c) < 0 &&
|
||||
hdr->b_l1hdr.b_state == arc_anon &&
|
||||
(refcount_count(&arc_anon->arcs_size) +
|
||||
refcount_count(&arc_mru->arcs_size) > arc_p))
|
||||
arc_p = MIN(arc_c, arc_p + size);
|
||||
@@ -7213,6 +7271,17 @@ arc_kstat_update(kstat_t *ksp, int rw)
|
||||
&as->arcstat_mfu_ghost_evictable_data,
|
||||
&as->arcstat_mfu_ghost_evictable_metadata);
|
||||
|
||||
ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
|
||||
ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
|
||||
ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
|
||||
ARCSTAT(arcstat_metadata_size) =
|
||||
aggsum_value(&astat_metadata_size);
|
||||
ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
|
||||
ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
|
||||
ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
|
||||
ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
|
||||
ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
|
||||
|
||||
as->arcstat_memory_all_bytes.value.ui64 =
|
||||
arc_all_memory();
|
||||
as->arcstat_memory_free_bytes.value.ui64 =
|
||||
@@ -7424,6 +7493,16 @@ arc_state_init(void)
|
||||
refcount_create(&arc_mfu_ghost->arcs_size);
|
||||
refcount_create(&arc_l2c_only->arcs_size);
|
||||
|
||||
aggsum_init(&arc_meta_used, 0);
|
||||
aggsum_init(&arc_size, 0);
|
||||
aggsum_init(&astat_data_size, 0);
|
||||
aggsum_init(&astat_metadata_size, 0);
|
||||
aggsum_init(&astat_hdr_size, 0);
|
||||
aggsum_init(&astat_l2_hdr_size, 0);
|
||||
aggsum_init(&astat_bonus_size, 0);
|
||||
aggsum_init(&astat_dnode_size, 0);
|
||||
aggsum_init(&astat_dbuf_size, 0);
|
||||
|
||||
arc_anon->arcs_state = ARC_STATE_ANON;
|
||||
arc_mru->arcs_state = ARC_STATE_MRU;
|
||||
arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
|
||||
@@ -7465,6 +7544,16 @@ arc_state_fini(void)
|
||||
multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
|
||||
multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
|
||||
multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
|
||||
|
||||
aggsum_fini(&arc_meta_used);
|
||||
aggsum_fini(&arc_size);
|
||||
aggsum_fini(&astat_data_size);
|
||||
aggsum_fini(&astat_metadata_size);
|
||||
aggsum_fini(&astat_hdr_size);
|
||||
aggsum_fini(&astat_l2_hdr_size);
|
||||
aggsum_fini(&astat_bonus_size);
|
||||
aggsum_fini(&astat_dnode_size);
|
||||
aggsum_fini(&astat_dbuf_size);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
@@ -7516,7 +7605,6 @@ arc_init(void)
|
||||
|
||||
arc_c = arc_c_max;
|
||||
arc_p = (arc_c >> 1);
|
||||
arc_size = 0;
|
||||
|
||||
/* Set min to 1/2 of arc_c_min */
|
||||
arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
|
||||
|
||||
Reference in New Issue
Block a user