From 7cb67b45f33fd7a61af24c675c7347eb5264b38c Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 29 Mar 2011 18:08:59 -0700 Subject: [PATCH] Add direct+indirect ARC reclaim Under OpenSolaris all memory reclaim is done asynchronously. Under Linux memory reclaim is done asynchronously _and_ synchronously. When a process allocates memory with GFP_KERNEL it explicitly allows the kernel to do reclaim on its behalf to satisfy the allocation. If that GFP_KERNEL allocation fails the kernel may take more drastic measures to reclaim the memory such as killing user space processes. This was observed to happen with ZFS because the ARC could consume a large fraction of the system memory but no synchronous reclaim could be performed on it. The result was GFP_KERNEL allocations could fail resulting in OOM events, and only moments later the arc_reclaim thread would free unused memory from the ARC. This change leaves the arc_thread in place to manage the fundamental ARC behavior. But it adds a synchronous (direct) reclaim path for the ARC which can be called when memory is badly needed. It also adds an asynchronous (indirect) reclaim path which is called much more frequently to prune the ARC slab caches. 
--- module/zfs/arc.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 63b659b9a..9ca8ad17d 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -282,6 +282,8 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_memory_direct_count; + kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_no_grow; kstat_named_t arcstat_tempreserve; kstat_named_t arcstat_loaned_bytes; @@ -344,6 +346,8 @@ static arc_stats_t arc_stats = { { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "memory_direct_count", KSTAT_DATA_UINT64 }, + { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, @@ -2171,6 +2175,51 @@ arc_reclaim_thread(void) thread_exit(); } +#ifdef _KERNEL +/* + * Under Linux the arc shrinker may be called for synchronous (direct) + * reclaim, or asynchronous (indirect) reclaim. When called by kswapd + * for indirect reclaim we take a conservative approach and just reap + * free slabs from the ARC caches. If this proves to be insufficient + * direct reclaim will be triggered. In direct reclaim a more aggressive + * strategy is used, data is evicted from the ARC and free slabs reaped. 
+ */ +SPL_SHRINKER_CALLBACK_PROTO(arc_shrinker_func, cb, nr_to_scan, gfp_mask) +{ + arc_reclaim_strategy_t strategy; + int arc_reclaim; + + /* Not allowed to perform filesystem reclaim */ + if (!(gfp_mask & __GFP_FS)) + return (-1); + + /* Return number of reclaimable pages based on arc_shrink_shift */ + arc_reclaim = btop((arc_size - arc_c_min)) >> arc_shrink_shift; + if (nr_to_scan == 0) + return (arc_reclaim); + + /* Reclaim in progress */ + if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) + return (-1); + + if (current_is_kswapd()) { + strategy = ARC_RECLAIM_CONS; + ARCSTAT_INCR(arcstat_memory_indirect_count, 1); + } else { + strategy = ARC_RECLAIM_AGGR; + ARCSTAT_INCR(arcstat_memory_direct_count, 1); + } + + arc_kmem_reap_now(strategy); + arc_reclaim = btop((arc_size - arc_c_min)) >> arc_shrink_shift; + mutex_exit(&arc_reclaim_thr_lock); + + return (arc_reclaim); +} + +SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS); +#endif /* _KERNEL */ + /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called @@ -3485,6 +3534,12 @@ arc_init(void) * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. + */ + spl_register_shrinker(&arc_shrinker); #endif /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ @@ -3602,6 +3657,10 @@ void arc_fini(void) { mutex_enter(&arc_reclaim_thr_lock); +#ifdef _KERNEL + spl_unregister_shrinker(&arc_shrinker); +#endif /* _KERNEL */ + arc_thread_exit = 1; while (arc_thread_exit != 0) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);