From 37db7d8cf9936e6d2851a4329c11efcd9f61305c Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 12 Feb 2009 13:32:10 -0800 Subject: [PATCH] kmem slab fixes - Default SPL_KMEM_CACHE_DELAY changed to 15 to match Solaris. - Aged out slab checking occurs every SPL_KMEM_CACHE_DELAY / 3. - skc->skc_reap tunable added which allows callers of spl_slab_reclaim() to cap the number of slabs reclaimed. On Solaris all eligible slabs are always reclaimed, and this is still the default behavior. However, I suspect that is not always wise for reasons such as in the next comment. - spl_slab_reclaim() added cond_resched() while walking the slab/object free lists. Soft lockups were observed when freeing large numbers of vmalloc'd slabs/objects. - spl_slab_reclaim() 'sks->sks_ref > 0' check changes from incorrect 'break' to 'continue' to ensure all slabs are checked. - spl_cache_age() reworked to avoid a deadlock with do_flush_tlb_all() which occurred because we slept waiting for completion in spl_cache_age(). Waiting for magazine reclamation to finish is not required so we no longer wait. - spl_magazine_create() and spl_magazine_destroy() shifted back to using for_each_online_cpu() instead of the spl_on_each_cpu() approach which was of course a bad idea due to memory allocations which Ricardo pointed out. 
--- include/sys/kmem.h | 4 +- module/spl/spl-kmem.c | 87 +++++++++++++++++++++++++------------------ 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 7281f1063..dc66a9153 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -239,7 +239,8 @@ extern struct rw_semaphore spl_kmem_cache_sem; #define SKS_MAGIC 0x22222222 #define SKC_MAGIC 0x2c2c2c2c -#define SPL_KMEM_CACHE_DELAY 5 /* Minimum slab release age */ +#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ +#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ #define SPL_KMEM_CACHE_OBJ_PER_SLAB 32 /* Target objects per slab */ #define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 8 /* Minimum objects per slab */ #define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ @@ -292,6 +293,7 @@ typedef struct spl_kmem_cache { uint32_t skc_slab_objs; /* Objects per slab */ uint32_t skc_slab_size; /* Slab size */ uint32_t skc_delay; /* Slab reclaim interval */ + uint32_t skc_reap; /* Slab reclaim count */ atomic_t skc_ref; /* Ref count callers */ struct delayed_work skc_work; /* Slab reclaim work */ struct work_struct work; diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index b5cd9fb12..d82d7b49f 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -856,16 +856,19 @@ spl_slab_free(spl_kmem_slab_t *sks, /* * Traverses all the partial slabs attached to a cache and free those * which which are currently empty, and have not been touched for - * skc_delay seconds. This is to avoid thrashing. + * skc_delay seconds to avoid thrashing. The count argument is + * passed to optionally cap the number of slabs reclaimed, a count + * of zero means try and reclaim everything. When flag is set we + * always free an available slab regardless of age. 
*/ static void -spl_slab_reclaim(spl_kmem_cache_t *skc, int flag) +spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) { spl_kmem_slab_t *sks, *m; spl_kmem_obj_t *sko, *n; LIST_HEAD(sks_list); LIST_HEAD(sko_list); - int size; + int size, i = 0; ENTRY; /* @@ -878,11 +881,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag) spin_lock(&skc->skc_lock); list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) { - if (sks->sks_ref > 0) - break; + /* Release at most count slabs */ + if (count && i > count) + break; - if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)) + /* Skip active slabs */ + if (sks->sks_ref > 0) + continue; + + if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) { spl_slab_free(sks, &sks_list, &sko_list); + i++; + } } spin_unlock(&skc->skc_lock); @@ -896,12 +906,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag) size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) + P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align); - list_for_each_entry_safe(sko, n, &sko_list, sko_list) + /* To avoid soft lockups conditionally reschedule */ + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { kv_free(skc, sko->sko_addr, size); + cond_resched(); + } } - list_for_each_entry_safe(sks, m, &sks_list, sks_list) + /* To avoid soft lockups conditionally reschedule */ + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { kv_free(skc, sks, skc->skc_slab_size); + cond_resched(); + } EXIT; } @@ -937,11 +953,11 @@ spl_cache_age(void *data) spl_get_work_data(data, spl_kmem_cache_t, skc_work.work); ASSERT(skc->skc_magic == SKC_MAGIC); - spl_on_each_cpu(spl_magazine_age, skc, 1); - spl_slab_reclaim(skc, 0); + spl_slab_reclaim(skc, skc->skc_reap, 0); + spl_on_each_cpu(spl_magazine_age, skc, 0); if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)) - schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ); + schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ); } /* @@ -1057,49 +1073,47 @@ 
spl_magazine_free(spl_kmem_magazine_t *skm) EXIT; } -static void -__spl_magazine_create(void *data) -{ - spl_kmem_cache_t *skc = data; - int id = smp_processor_id(); - - skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id)); - ASSERT(skc->skc_mag[id]); -} - /* * Create all pre-cpu magazines of reasonable sizes. */ static int spl_magazine_create(spl_kmem_cache_t *skc) { + int i; ENTRY; skc->skc_mag_size = spl_magazine_size(skc); skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; - spl_on_each_cpu(__spl_magazine_create, skc, 1); + + for_each_online_cpu(i) { + skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i)); + if (!skc->skc_mag[i]) { + for (i--; i >= 0; i--) + spl_magazine_free(skc->skc_mag[i]); + + RETURN(-ENOMEM); + } + } RETURN(0); } -static void -__spl_magazine_destroy(void *data) -{ - spl_kmem_cache_t *skc = data; - spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; - - (void)spl_cache_flush(skc, skm, skm->skm_avail); - spl_magazine_free(skm); -} - /* * Destroy all pre-cpu magazines. 
*/ static void spl_magazine_destroy(spl_kmem_cache_t *skc) { + spl_kmem_magazine_t *skm; + int i; ENTRY; - spl_on_each_cpu(__spl_magazine_destroy, skc, 1); + + for_each_online_cpu(i) { + skm = skc->skc_mag[i]; + (void)spl_cache_flush(skc, skm, skm->skm_avail); + spl_magazine_free(skm); + } + EXIT; } @@ -1168,6 +1182,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, skc->skc_obj_size = size; skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; skc->skc_delay = SPL_KMEM_CACHE_DELAY; + skc->skc_reap = SPL_KMEM_CACHE_REAP; atomic_set(&skc->skc_ref, 0); INIT_LIST_HEAD(&skc->skc_list); @@ -1209,7 +1224,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, GOTO(out, rc); spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc); - schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ); + schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ); down_write(&spl_kmem_cache_sem); list_add_tail(&skc->skc_list, &spl_kmem_cache_list); @@ -1249,7 +1264,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) wait_event(wq, atomic_read(&skc->skc_ref) == 0); spl_magazine_destroy(skc); - spl_slab_reclaim(skc, 1); + spl_slab_reclaim(skc, 0, 1); spin_lock(&skc->skc_lock); /* Validate there are no objects in use and free all the @@ -1654,7 +1669,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) if (skc->skc_reclaim) skc->skc_reclaim(skc->skc_private); - spl_slab_reclaim(skc, 0); + spl_slab_reclaim(skc, skc->skc_reap, 0); clear_bit(KMC_BIT_REAPING, &skc->skc_flags); atomic_dec(&skc->skc_ref);