From 026e529cb336d1d656eaa33f58ac911a70f273cd Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Sun, 19 Jul 2020 09:58:30 -0700 Subject: [PATCH] Remove skc_reclaim, hdr_recl, kmem_cache shrinker The SPL kmem_cache implementation provides a mechanism, `skc_reclaim`, whereby individual caches can register a callback to be invoked when there is memory pressure. This mechanism is used in only one place: the ARC registers the `hdr_recl()` reclaim function. This function wakes up the `arc_reap_zthr`, whose job is to call `kmem_cache_reap()` and `arc_reduce_target_size()`. The `skc_reclaim` callbacks are invoked only by shrinker callbacks and `arc_reap_zthr`, and only callback only wakes up `arc_reap_zthr`. When called from `arc_reap_zthr`, waking `arc_reap_zthr` is a no-op. When called from shrinker callbacks, we are already aware of memory pressure and responding to it. Therefore there is little benefit to ever calling the `hdr_recl()` `skc_reclaim` callback. The `arc_reap_zthr` also wakes once a second, and if memory is low when allocating an ARC buffer. Therefore, additionally waking it from the shrinker calbacks has little benefit. The shrinker callbacks can be invoked very frequently, e.g. 10,000 times per second. Additionally, for invocation of the shrinker callback, skc_reclaim is invoked many times. Therefore, this mechanism consumes significant amounts of CPU time. The kmem_cache shrinker calls `spl_kmem_cache_reap_now()`, which, in addition to invoking `skc_reclaim()`, does two things to attempt to free pages for use by the system: 1. Return free objects from the magazine layer to the slab layer 2. Return entirely-free slabs to the page layer (i.e. free pages) These actions apply only to caches implemented by the SPL, not those that use the underlying kernel SLAB/SLUB caches. The SPL caches are used for objects >=32KB, which are primarily linear ABD's cached in the DBUF cache. These actions (freeing objects from the magazine layer and returning entirely-free slabs) are also taken whenever a `kmem_cache_free()` call finds a full magazine. So there would typically be zero entirely-free slabs, and the number of objects in magazines is limited (typically no more than 64 objects per magazine, and there's one magazine per CPU). Therefore the benefit of `spl_kmem_cache_reap_now()`, while nonzero, is modest. We also call `spl_kmem_cache_reap_now()` from the `arc_reap_zthr`, when memory pressure is detected. Therefore, calling `spl_kmem_cache_reap_now()` from the kmem_cache shrinker is not needed. This commit removes the `skc_reclaim` mechanism, its only callback `hdr_recl()`, and the kmem_cache shrinker callback. Reviewed-By: Brian Behlendorf Reviewed-by: George Wilson Reviewed-by: Pavel Zakharov Signed-off-by: Matthew Ahrens Closes #10576 --- include/os/linux/spl/sys/kmem_cache.h | 4 +- module/os/linux/spl/spl-kmem-cache.c | 140 +++----------------------- module/zfs/arc.c | 42 +------- 3 files changed, 20 insertions(+), 166 deletions(-) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index ce18eb474..30d2dd5ed 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -124,7 +124,6 @@ extern struct rw_semaphore spl_kmem_cache_sem; typedef int (*spl_kmem_ctor_t)(void *, void *, int); typedef void (*spl_kmem_dtor_t)(void *, void *); -typedef void (*spl_kmem_reclaim_t)(void *); typedef struct spl_kmem_magazine { uint32_t skm_magic; /* Sanity magic */ @@ -174,7 +173,6 @@ typedef struct spl_kmem_cache { uint32_t skc_mag_refill; /* Magazine refill count */ spl_kmem_ctor_t skc_ctor; /* Constructor */ spl_kmem_dtor_t skc_dtor; /* Destructor */ - spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ void *skc_private; /* Private data */ void *skc_vmp; /* Unused */ struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ @@ -210,7 +208,7 @@ typedef struct spl_kmem_cache { extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); + void *reclaim, void *priv, void *vmp, int flags); extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, kmem_cbrc_t (*)(void *, void *, size_t, void *)); extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 034727410..76b89b254 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -883,7 +882,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) */ spl_kmem_cache_t * spl_kmem_cache_create(char *name, size_t size, size_t align, - spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags) { gfp_t lflags = kmem_flags_convert(KM_SLEEP); @@ -897,6 +896,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, ASSERT0(flags & KMC_NOHASH); ASSERT0(flags & KMC_QCACHE); ASSERT(vmp == NULL); + ASSERT(reclaim == NULL); might_sleep(); @@ -915,7 +915,6 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, skc->skc_ctor = ctor; skc->skc_dtor = dtor; - skc->skc_reclaim = reclaim; skc->skc_private = priv; skc->skc_vmp = vmp; skc->skc_linux_cache = NULL; @@ -1606,78 +1605,11 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) EXPORT_SYMBOL(spl_kmem_cache_free); /* - * The generic shrinker function for all caches. Under Linux a shrinker - * may not be tightly coupled with a slab cache. In fact Linux always - * systematically tries calling all registered shrinker callbacks which - * report that they contain unused objects. Because of this we only - * register one shrinker function in the shim layer for all slab caches. - * We always attempt to shrink all caches when this generic shrinker - * is called. - * - * The _count() function returns the number of free-able objects. - * The _scan() function returns the number of objects that were freed. - */ -static unsigned long -spl_kmem_cache_shrinker_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - spl_kmem_cache_t *skc = NULL; - int alloc = 0; - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - alloc += skc->skc_obj_alloc; - } - up_read(&spl_kmem_cache_sem); - - return (MAX(alloc, 0)); -} - -static unsigned long -spl_kmem_cache_shrinker_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - spl_kmem_cache_t *skc = NULL; - int alloc = 0; - - /* - * No shrinking in a transaction context. Can cause deadlocks. - */ - if (spl_fstrans_check()) - return (SHRINK_STOP); - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - uint64_t oldalloc = skc->skc_obj_alloc; - spl_kmem_cache_reap_now(skc); - if (oldalloc > skc->skc_obj_alloc) - alloc += oldalloc - skc->skc_obj_alloc; - } - up_read(&spl_kmem_cache_sem); - - /* - * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. - * This functionality only exists to work around a rare issue where - * shrink_slabs() is repeatedly invoked by many cores causing the - * system to thrash. - */ - if (spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) - return (SHRINK_STOP); - - return (MAX(alloc, 0)); -} - -SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, - spl_kmem_cache_shrinker_count, spl_kmem_cache_shrinker_scan, - KMC_DEFAULT_SEEKS); - -/* - * Call the registered reclaim function for a cache. Depending on how - * many and which objects are released it may simply repopulate the - * local magazine which will then need to age-out. Objects which cannot - * fit in the magazine we will be released back to their slabs which will - * also need to age out before being release. This is all just best - * effort and we do not want to thrash creating and destroying slabs. + * Depending on how many and which objects are released it may simply + * repopulate the local magazine which will then need to age-out. Objects + * which cannot fit in the magazine will be released back to their slabs + * which will also need to age out before being released. This is all just + * best effort and we do not want to thrash creating and destroying slabs. */ void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) @@ -1685,16 +1617,10 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - atomic_inc(&skc->skc_ref); + if (skc->skc_flags & KMC_SLAB) + return; - /* - * Execute the registered reclaim callback if it exists. - */ - if (skc->skc_flags & KMC_SLAB) { - if (skc->skc_reclaim) - skc->skc_reclaim(skc->skc_private); - goto out; - } + atomic_inc(&skc->skc_ref); /* * Prevent concurrent cache reaping when contended. @@ -1702,39 +1628,6 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) goto out; - /* - * When a reclaim function is available it may be invoked repeatedly - * until at least a single slab can be freed. This ensures that we - * do free memory back to the system. This helps minimize the chance - * of an OOM event when the bulk of memory is used by the slab. - * - * When free slabs are already available the reclaim callback will be - * skipped. Additionally, if no forward progress is detected despite - * a reclaim function the cache will be skipped to avoid deadlock. - * - * Longer term this would be the correct place to add the code which - * repacks the slabs in order minimize fragmentation. - */ - if (skc->skc_reclaim) { - uint64_t objects = UINT64_MAX; - int do_reclaim; - - do { - spin_lock(&skc->skc_lock); - do_reclaim = - (skc->skc_slab_total > 0) && - ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && - (skc->skc_obj_alloc < objects); - - objects = skc->skc_obj_alloc; - spin_unlock(&skc->skc_lock); - - if (do_reclaim) - skc->skc_reclaim(skc->skc_private); - - } while (do_reclaim); - } - /* Reclaim from the magazine and free all now empty slabs. */ if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { spl_kmem_magazine_t *skm; @@ -1773,12 +1666,13 @@ EXPORT_SYMBOL(spl_kmem_cache_reap_active); void spl_kmem_reap(void) { - struct shrink_control sc; + spl_kmem_cache_t *skc = NULL; - sc.nr_to_scan = KMC_REAP_CHUNK; - sc.gfp_mask = GFP_KERNEL; - - (void) spl_kmem_cache_shrinker_scan(NULL, &sc); + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + spl_kmem_cache_reap_now(skc); + } + up_read(&spl_kmem_cache_sem); } EXPORT_SYMBOL(spl_kmem_reap); @@ -1791,7 +1685,6 @@ spl_kmem_cache_init(void) spl_kmem_cache_kmem_threads, maxclsyspri, spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - spl_register_shrinker(&spl_kmem_cache_shrinker); return (0); } @@ -1799,6 +1692,5 @@ spl_kmem_cache_init(void) void spl_kmem_cache_fini(void) { - spl_unregister_shrinker(&spl_kmem_cache_shrinker); taskq_destroy(spl_kmem_cache_taskq); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index ea22686cc..5b9df43d2 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -380,11 +380,6 @@ static int arc_min_prescient_prefetch_ms; */ int arc_lotsfree_percent = 10; -/* - * hdr_recl() uses this to determine if the arc is up and running. - */ -static boolean_t arc_initialized; - /* * The arc has filled available memory and has now warmed up. */ @@ -1198,22 +1193,6 @@ buf_dest(void *vbuf, void *unused) arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } -/* - * Reclaim callback -- invoked when memory is low. - */ -/* ARGSUSED */ -static void -hdr_recl(void *unused) -{ - dprintf("hdr_recl called\n"); - /* - * umem calls the reclaim func when we destroy the buf cache, - * which is after we do arc_fini(). - */ - if (arc_initialized) - zthr_wakeup(arc_reap_zthr); -} - static void buf_init(void) { @@ -1249,12 +1228,12 @@ retry: } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, - 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, - hdr_recl, NULL, NULL, 0); + NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", - HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -4688,9 +4667,6 @@ arc_kmem_reap_soon(void) static boolean_t arc_adjust_cb_check(void *arg, zthr_t *zthr) { - if (!arc_initialized) - return (B_FALSE); - /* * This is necessary so that any changes which may have been made to * many of the zfs_arc_* module parameters will be propagated to @@ -4778,9 +4754,6 @@ arc_adjust_cb(void *arg, zthr_t *zthr) static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { - if (!arc_initialized) - return (B_FALSE); - int64_t free_memory = arc_available_memory(); /* @@ -7348,12 +7321,6 @@ arc_init(void) arc_state_init(); - /* - * The arc must be "uninitialized", so that hdr_recl() (which is - * registered by buf_init()) will not access arc_reap_zthr before - * it is created. - */ - ASSERT(!arc_initialized); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), @@ -7377,7 +7344,6 @@ arc_init(void) arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1)); - arc_initialized = B_TRUE; arc_warm = B_FALSE; /* @@ -7412,8 +7378,6 @@ arc_fini(void) /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); - arc_initialized = B_FALSE; - if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL;