Add direct+indirect ARC reclaim

Under OpenSolaris all memory reclaim is done asynchronously.  Under
Linux memory reclaim is done asynchronously _and_ synchronously.
When a process allocates memory with GFP_KERNEL it explicitly allows
the kernel to do reclaim on its behalf to satisfy the allocation.
If that GFP_KERNEL allocation fails the kernel may take more drastic
measures to reclaim the memory such as killing user space processes.

This was observed to happen with ZFS because the ARC could consume
a large fraction of the system memory but no synchronous reclaim
could be performed on it.  The result was GFP_KERNEL allocations
could fail resulting in OOM events, and only moments later the
arc_reclaim thread would free unused memory from the ARC.

This change leaves the arc_thread in place to manage the fundamental
ARC behavior.  But it adds a synchronous (direct) reclaim path for
the ARC which can be called when memory is badly needed.  It also
adds an asynchronous (indirect) reclaim path which is called
much more frequently to prune the ARC slab caches.
This commit is contained in:
Brian Behlendorf 2011-03-29 18:08:59 -07:00
parent 1834f2d8b7
commit 7cb67b45f3

View File

@ -282,6 +282,8 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
kstat_named_t arcstat_no_grow;
kstat_named_t arcstat_tempreserve;
kstat_named_t arcstat_loaned_bytes;
@ -344,6 +346,8 @@ static arc_stats_t arc_stats = {
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "memory_direct_count", KSTAT_DATA_UINT64 },
{ "memory_indirect_count", KSTAT_DATA_UINT64 },
{ "arc_no_grow", KSTAT_DATA_UINT64 },
{ "arc_tempreserve", KSTAT_DATA_UINT64 },
{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
@ -2171,6 +2175,51 @@ arc_reclaim_thread(void)
thread_exit();
}
#ifdef _KERNEL
/*
 * Number of pages the shrinker reports as reclaimable: the portion of the
 * ARC above arc_c_min, converted to pages and scaled down by
 * arc_shrink_shift.
 *
 * Returns 0 when arc_size is at or below arc_c_min.  Without this guard
 * the subtraction would go below zero.  NOTE(review): both counters are
 * declared outside this hunk and appear to be unsigned 64-bit values, in
 * which case the difference would wrap to a huge number and be truncated
 * into a bogus page count -- confirm against their declarations.
 */
static int
arc_shrinker_reclaimable(void)
{
	if (arc_size <= arc_c_min)
		return (0);

	return (btop((arc_size - arc_c_min)) >> arc_shrink_shift);
}

/*
 * Under Linux the arc shrinker may be called for synchronous (direct)
 * reclaim, or asynchronous (indirect) reclaim.  When called by kswapd
 * for indirect reclaim we take a conservative approach and just reap
 * free slabs from the ARC caches.  If this proves to be insufficient
 * direct reclaim will be triggered.  In direct reclaim a more aggressive
 * strategy is used, data is evicted from the ARC and free slabs reaped.
 *
 * A nr_to_scan of zero is a query: the reclaimable page estimate is
 * returned and nothing is freed.  Returns -1 when the caller's gfp_mask
 * lacks __GFP_FS or when arc_reclaim_thr_lock is already held; otherwise
 * returns the reclaimable page estimate (never negative).
 */
SPL_SHRINKER_CALLBACK_PROTO(arc_shrinker_func, cb, nr_to_scan, gfp_mask)
{
	arc_reclaim_strategy_t strategy;
	int arc_reclaim;

	/* Not allowed to perform filesystem reclaim */
	if (!(gfp_mask & __GFP_FS))
		return (-1);

	/* Return number of reclaimable pages based on arc_shrink_shift */
	if (nr_to_scan == 0)
		return (arc_shrinker_reclaimable());

	/* Reclaim in progress */
	if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
		return (-1);

	/* kswapd means indirect reclaim; any other caller is direct reclaim */
	if (current_is_kswapd()) {
		strategy = ARC_RECLAIM_CONS;
		ARCSTAT_INCR(arcstat_memory_indirect_count, 1);
	} else {
		strategy = ARC_RECLAIM_AGGR;
		ARCSTAT_INCR(arcstat_memory_direct_count, 1);
	}

	arc_kmem_reap_now(strategy);

	/* Recompute after reaping so the caller sees what is still left */
	arc_reclaim = arc_shrinker_reclaimable();
	mutex_exit(&arc_reclaim_thr_lock);

	return (arc_reclaim);
}
SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
#endif /* _KERNEL */
/*
* Adapt arc info given the number of bytes we are trying to add and
* the state that we are coming from. This function is only called
@ -3485,6 +3534,12 @@ arc_init(void)
* need to limit the cache to 1/8 of VM size.
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
/*
* Register a shrinker to support synchronous (direct) memory
* reclaim from the arc. This is done to prevent kswapd from
* swapping out pages when it is preferable to shrink the arc.
*/
spl_register_shrinker(&arc_shrinker);
#endif
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
@ -3602,6 +3657,10 @@ void
arc_fini(void)
{
mutex_enter(&arc_reclaim_thr_lock);
#ifdef _KERNEL
spl_unregister_shrinker(&arc_shrinker);
#endif /* _KERNEL */
arc_thread_exit = 1;
while (arc_thread_exit != 0)
cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);