Make arc_prune() asynchronous
As described in the comment above arc_adapt_thread() it is critical that the arc_adapt_thread() function never sleep while holding a hash lock. This behavior was possible in the Linux implementation because the arc_prune() logic was implemented to be synchronous. Under illumos the analogous dnlc_reduce_cache() function is asynchronous. To address this, the arc_do_user_prune() function has been reworked into two new functions as follows:

* arc_prune_async() is an asynchronous implementation which dispatches the prune callback to be run by the arc_prune taskq. This makes it suitable for use in the context of the arc_adapt_thread().

* arc_prune() is a synchronous implementation which depends on the arc_prune_async() implementation but blocks until the outstanding callbacks complete. This is used in arc_kmem_reap_now() where it is safe, and expected, that memory will be freed.

This patch additionally adds the zfs_arc_meta_strategy module option which allows the meta reclaim strategy to be configured. It defaults to a balanced strategy which has proven to work well under Linux, but the illumos meta-only strategy can be enabled instead.

Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Parent: c5528b9ba6
Commit: f604673836
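For context, consumers interact with the machinery changed below roughly as follows. This is a minimal sketch, assuming the existing arc_add_prune_callback()/arc_remove_prune_callback() registration API; my_fs_t and the callback body are hypothetical:

	/*
	 * Hypothetical consumer of the ARC prune callback API.  With this
	 * patch the callback runs from the arc_prune taskq rather than
	 * directly in arc_adapt_thread(), so it may block safely.
	 */
	#include <sys/arc.h>

	typedef struct my_fs {
		arc_prune_t *fs_prune;	/* handle returned at registration */
	} my_fs_t;

	static void
	my_fs_prune(int64_t nr_to_scan, void *priv)
	{
		my_fs_t *fs = priv;
		/* drop up to nr_to_scan holds on metadata cached by fs */
	}

	static void
	my_fs_mount(my_fs_t *fs)
	{
		fs->fs_prune = arc_add_prune_callback(my_fs_prune, fs);
	}

	static void
	my_fs_unmount(my_fs_t *fs)
	{
		/*
		 * Safe even if a prune task is still in flight: the
		 * arc_prune_t is refcounted and freed only when the last
		 * holder drops its reference.
		 */
		arc_remove_prune_callback(fs->fs_prune);
	}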
@@ -59,10 +59,16 @@ arc_done_func_t arc_getbuf_func;
 struct arc_prune {
 	arc_prune_func_t	*p_pfunc;
 	void			*p_private;
+	uint64_t		p_adjust;
 	list_node_t		p_node;
 	refcount_t		p_refcnt;
 };
 
+typedef enum arc_strategy {
+	ARC_STRATEGY_META_ONLY		= 0,	/* Evict only meta data buffers */
+	ARC_STRATEGY_META_BALANCED	= 1,	/* Evict data buffers if needed */
+} arc_strategy_t;
+
 typedef enum arc_flags
 {
 	/*
module/zfs/arc.c (155 lines changed)
@@ -167,6 +167,9 @@ static boolean_t arc_user_evicts_thread_exit;
 /* number of objects to prune from caches when arc_meta_limit is reached */
 int zfs_arc_meta_prune = 10000;
 
+/* The preferred strategy to employ when arc_meta_limit is reached */
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+
 typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
@@ -531,6 +534,7 @@ static arc_state_t *arc_l2c_only;
 
 static list_t arc_prune_list;
 static kmutex_t arc_prune_mtx;
+static taskq_t *arc_prune_taskq;
 static arc_buf_t *arc_eviction_list;
 static arc_buf_hdr_t arc_eviction_hdr;
 
@@ -2430,47 +2434,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
 }
 
 /*
- * Request that arc user drop references so that N bytes can be released
- * from the cache. This provides a mechanism to ensure the arc can honor
- * the arc_meta_limit and reclaim buffers which are pinned in the cache
- * by higher layers. (i.e. the zpl)
+ * Helper function for arc_prune() it is responsible for safely handling
+ * the execution of a registered arc_prune_func_t.
  */
 static void
-arc_do_user_prune(int64_t adjustment)
+arc_prune_task(void *ptr)
 {
-	arc_prune_func_t *func;
-	void *private;
-	arc_prune_t *cp, *np;
+	arc_prune_t *ap = (arc_prune_t *)ptr;
+	arc_prune_func_t *func = ap->p_pfunc;
+
+	if (func != NULL)
+		func(ap->p_adjust, ap->p_private);
+
+	/* Callback unregistered concurrently with execution */
+	if (refcount_remove(&ap->p_refcnt, func) == 0) {
+		ASSERT(!list_link_active(&ap->p_node));
+		refcount_destroy(&ap->p_refcnt);
+		kmem_free(ap, sizeof (*ap));
+	}
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffered they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asyncronously so it may be safely called
+ * in the context of the arc_adapt_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+static void
+arc_prune_async(int64_t adjust)
+{
+	arc_prune_t *ap;
 
 	mutex_enter(&arc_prune_mtx);
+	for (ap = list_head(&arc_prune_list); ap != NULL;
+	    ap = list_next(&arc_prune_list, ap)) {
 
-	cp = list_head(&arc_prune_list);
-	while (cp != NULL) {
-		func = cp->p_pfunc;
-		private = cp->p_private;
-		np = list_next(&arc_prune_list, cp);
-		refcount_add(&cp->p_refcnt, func);
-		mutex_exit(&arc_prune_mtx);
+		if (refcount_count(&ap->p_refcnt) >= 2)
+			continue;
 
-		if (func != NULL)
-			func(adjustment, private);
+		refcount_add(&ap->p_refcnt, ap->p_pfunc);
+		ap->p_adjust = adjust;
+		taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP);
+		ARCSTAT_BUMP(arcstat_prune);
+	}
+	mutex_exit(&arc_prune_mtx);
+}
 
-		mutex_enter(&arc_prune_mtx);
-
-		/* User removed prune callback concurrently with execution */
-		if (refcount_remove(&cp->p_refcnt, func) == 0) {
-			ASSERT(!list_link_active(&cp->p_node));
-			refcount_destroy(&cp->p_refcnt);
-			kmem_free(cp, sizeof (*cp));
-		}
-
-		cp = np;
-	}
-
-	ARCSTAT_BUMP(arcstat_prune);
-	mutex_exit(&arc_prune_mtx);
+static void
+arc_prune(int64_t adjust)
+{
+	arc_prune_async(adjust);
+	taskq_wait_outstanding(arc_prune_taskq, 0);
 }
 
 /*
  * Evict the specified number of bytes from the state specified,
  * restricting eviction to the spa and type given. This function
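The refcount_count(&ap->p_refcnt) >= 2 test above is worth spelling out: registration itself holds one reference, so a count of two or more means a prune task for that callback is already queued or running and another dispatch would be redundant. The implied lifetime, restated as a comment (illustrative only; this is my reading of the hunk above, not additional code from the patch):

	/*
	 * p_refcnt == 1  callback registered, no prune task in flight
	 * p_refcnt == 2  a task is queued or running; arc_prune_async()
	 *                skips the entry, so at most one task per
	 *                callback is ever outstanding
	 * p_refcnt == 0  (seen by arc_prune_task() after its remove)
	 *                the callback was unregistered while the task
	 *                ran, so the task frees the arc_prune_t itself
	 */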
@@ -2511,7 +2532,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
  * available for reclaim.
  */
 static uint64_t
-arc_adjust_meta(void)
+arc_adjust_meta_balanced(void)
 {
 	int64_t adjustmnt, delta, prune = 0;
 	uint64_t total_evicted = 0;
@@ -2580,7 +2601,7 @@ restart:
 
 	if (zfs_arc_meta_prune) {
 		prune += zfs_arc_meta_prune;
-		arc_do_user_prune(prune);
+		arc_prune_async(prune);
 	}
 }
 
@@ -2592,6 +2613,50 @@ restart:
 	return (total_evicted);
 }
 
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta_only(void)
+{
+	uint64_t total_evicted = 0;
+	int64_t target;
+
+	/*
+	 * If we're over the meta limit, we want to evict enough
+	 * metadata to get back under the meta limit. We don't want to
+	 * evict so much that we drop the MRU below arc_p, though. If
+	 * we're over the meta limit more than we're over arc_p, we
+	 * evict some from the MRU here, and some from the MFU below.
+	 */
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
+
+	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+	/*
+	 * Similar to the above, we want to evict enough bytes to get us
+	 * below the meta limit, but not so much as to drop us below the
+	 * space alloted to the MFU (which is defined as arc_c - arc_p).
+	 */
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p)));
+
+	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+	return (total_evicted);
+}
+
+static uint64_t
+arc_adjust_meta(void)
+{
+	if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+		return (arc_adjust_meta_only());
+	else
+		return (arc_adjust_meta_balanced());
+}
+
 /*
  * Return the type of the oldest buffer in the given arc state
  *
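The two MIN() targets in arc_adjust_meta_only() are easier to see with numbers. A self-contained sketch with hypothetical sizes, not taken from the patch:

	#include <stdio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		/* Hypothetical sizes in MiB, chosen only for illustration */
		long long meta_used = 600, meta_limit = 500;	/* 100 over limit */
		long long anon_mru = 420, arc_p = 400;		/* MRU 20 over arc_p */
		long long mfu = 700, arc_c = 1000;		/* MFU share = c - p */

		long long mru_target = MIN(meta_used - meta_limit,
		    anon_mru - arc_p);
		long long mfu_target = MIN(meta_used - meta_limit,
		    mfu - (arc_c - arc_p));

		/*
		 * Prints mru_target=20 mfu_target=100: 20 MiB of metadata
		 * comes off the MRU (stopping at arc_p) and up to 100 MiB
		 * off the MFU (stopping at its arc_c - arc_p share).
		 */
		printf("mru_target=%lld mfu_target=%lld\n",
		    mru_target, mfu_target);
		return (0);
	}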
@@ -2905,6 +2970,14 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
 	extern kmem_cache_t *zio_buf_cache[];
 	extern kmem_cache_t *zio_data_buf_cache[];
 
+	if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
+		/*
+		 * We are exceeding our meta-data cache limit.
+		 * Prune some entries to release holds on meta-data.
+		 */
+		arc_prune(zfs_arc_meta_prune);
+	}
+
 	/*
 	 * An aggressive reclamation will shrink the cache size as well as
 	 * reap free buffers from the arc kmem caches.
@@ -2929,15 +3002,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
 }
 
 /*
- * Unlike other ZFS implementations this thread is only responsible for
- * adapting the target ARC size on Linux. The responsibility for memory
- * reclamation has been entirely delegated to the arc_shrinker_func()
- * which is registered with the VM. To reflect this change in behavior
- * the arc_reclaim thread has been renamed to arc_adapt.
- *
- * The following comment from arc_reclaim_thread() in illumos is still
- * applicable:
- *
  * Threads can block in arc_get_data_buf() waiting for this thread to evict
  * enough data and signal them to proceed. When this happens, the threads in
  * arc_get_data_buf() are sleeping while holding the hash lock for their
@@ -4862,6 +4926,9 @@ arc_init(void)
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
 
+	arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri,
+	    max_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
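The taskq_create() call added above, annotated argument by argument (ordering per the SPL/illumos interface: name, nthreads, pri, minalloc, maxalloc, flags; the glosses are mine, not comments from the patch):

	/*
	 * "arc_prune"		queue name, visible under /proc/spl/taskq
	 * max_ncpus		worker threads, so prune callbacks can run
	 *			concurrently on large systems
	 * minclsyspri		minimum system-class thread priority
	 * max_ncpus		task entries pre-allocated (minalloc)
	 * INT_MAX		effectively no cap on queued tasks (maxalloc)
	 * TASKQ_PREPOPULATE	allocate entries up front so dispatch does
	 *			not allocate in the reclaim path
	 */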
@@ -4943,6 +5010,9 @@ arc_fini(void)
 		arc_ksp = NULL;
 	}
 
+	taskq_wait(arc_prune_taskq);
+	taskq_destroy(arc_prune_taskq);
+
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_head(&arc_prune_list)) != NULL) {
 		list_remove(&arc_prune_list, p);
@@ -6374,6 +6444,9 @@ module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
 	"Limit number of restarts in arc_adjust_meta");
 
+module_param(zfs_arc_meta_strategy, int, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy");
+
 module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
 
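Since zfs_arc_meta_strategy is registered with mode 0644, it can also be changed on a running system through /sys/module/zfs/parameters/zfs_arc_meta_strategy; per the arc_strategy_t enum added above, 0 selects the illumos-style meta-only strategy and 1 the balanced Linux default.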