From fece7c99bf18c04993b679c6fca0dd4669361716 Mon Sep 17 00:00:00 2001
From: behlendo
Date: Sat, 28 Jun 2008 05:04:46 +0000
Subject: [PATCH] Victory!  I've reworked caches with large objects which are
 backed by vmalloc()'ed memory.  I now allocate a slab which is roughly
 32*spl_obj_size, and in this block of memory I place the slab descriptor,
 the slab object descriptors, and the objects themselves.  This greatly
 reduces vmalloc lock contention.  Some minor cleanup and fine tuning still
 remain, but it's working pretty well.

git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@139 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c
---
 include/sys/kmem.h         |   1 -
 modules/spl/spl-kmem.c     | 234 ++++++++++++++++++++++++++-----------
 modules/splat/splat-kmem.c |  50 +++++---
 3 files changed, 198 insertions(+), 87 deletions(-)

diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index 47ac72e77..3c1770052 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -485,7 +485,6 @@ typedef struct spl_kmem_magazine {
 
 typedef struct spl_kmem_obj {
         uint32_t              sko_magic;     /* Sanity magic */
-        uint32_t              sko_flags;     /* Per object flags */
         void                  *sko_addr;     /* Buffer address */
         struct spl_kmem_slab  *sko_slab;     /* Owned by slab */
         struct list_head      sko_list;      /* Free object list linkage */
diff --git a/modules/spl/spl-kmem.c b/modules/spl/spl-kmem.c
index 0ee04a287..be20c5b44 100644
--- a/modules/spl/spl-kmem.c
+++ b/modules/spl/spl-kmem.c
@@ -167,17 +167,9 @@ static struct shrinker spl_kmem_cache_shrinker = {
 };
 #endif
 
-static spl_kmem_slab_t *
-spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
-        spl_kmem_slab_t *sks;
-        spl_kmem_obj_t *sko, *n;
-        int i;
-        ENTRY;
-
-        sks = kmem_cache_alloc(spl_slab_cache, flags);
-        if (sks == NULL)
-                RETURN(sks);
-
+static void
+spl_slab_init(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
         sks->sks_magic = SKS_MAGIC;
         sks->sks_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
         sks->sks_age = jiffies;
@@ -185,91 +177,201 @@ spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
         INIT_LIST_HEAD(&sks->sks_list);
         INIT_LIST_HEAD(&sks->sks_free_list);
         sks->sks_ref = 0;
+}
 
+static int
+spl_slab_alloc_kmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks, int flags)
+{
+        spl_kmem_obj_t *sko, *n;
+        int i, rc = 0;
+
+        /* This is based on the linux slab cache for now simply because
+         * it means I get slab coloring, hardware cache alignment, etc.
+         * for free.  There's no reason we can't do this ourselves, and
+         * we probably should in the future.  For now I'll just
+         * leverage the existing linux slab here. */
         for (i = 0; i < sks->sks_objs; i++) {
                 sko = kmem_cache_alloc(spl_obj_cache, flags);
                 if (sko == NULL) {
-out_alloc:
-                        /* Unable to fully construct slab, objects,
-                         * and object data buffers unwind everything.
-                         */
-                        list_for_each_entry_safe(sko, n, &sks->sks_free_list,
-                                                 sko_list) {
-                                ASSERT(sko->sko_magic == SKO_MAGIC);
-                                vmem_free(sko->sko_addr, skc->skc_obj_size);
-                                list_del(&sko->sko_list);
-                                kmem_cache_free(spl_obj_cache, sko);
-                        }
-
-                        kmem_cache_free(spl_slab_cache, sks);
-                        GOTO(out, sks = NULL);
+                        rc = -ENOMEM;
+                        break;
                 }
 
-                /* Objects less than a page can use kmem_alloc() and avoid
-                 * the locking overhead in __get_vm_area_node() when locking
-                 * for a free address.  For objects over a page we use
-                 * vmem_alloc() because it is usually worth paying this
-                 * overhead to avoid the need to find contigeous pages.
-                 * This should give us the best of both worlds.
-                 */
-                if (skc->skc_obj_size <= PAGE_SIZE)
-                        sko->sko_addr = kmem_alloc(skc->skc_obj_size, flags);
-                else
-                        sko->sko_addr = vmem_alloc(skc->skc_obj_size, flags);
-
+                sko->sko_addr = kmem_alloc(skc->skc_obj_size, flags);
                 if (sko->sko_addr == NULL) {
                         kmem_cache_free(spl_obj_cache, sko);
-                        GOTO(out_alloc, sks = NULL);
+                        rc = -ENOMEM;
+                        break;
                 }
 
                 sko->sko_magic = SKO_MAGIC;
-                sko->sko_flags = 0;
                 sko->sko_slab = sks;
                 INIT_LIST_HEAD(&sko->sko_list);
                 INIT_HLIST_NODE(&sko->sko_hlist);
                 list_add(&sko->sko_list, &sks->sks_free_list);
         }
+
+        /* Unable to fully construct slab, unwind everything */
+        if (rc) {
+                list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
+                        ASSERT(sko->sko_magic == SKO_MAGIC);
+                        kmem_free(sko->sko_addr, skc->skc_obj_size);
+                        list_del(&sko->sko_list);
+                        kmem_cache_free(spl_obj_cache, sko);
+                }
+        }
+
+        RETURN(rc);
+}
+
+static spl_kmem_slab_t *
+spl_slab_alloc_vmem(spl_kmem_cache_t *skc, int flags)
+{
+        spl_kmem_slab_t *sks;
+        spl_kmem_obj_t *sko, *sko_base;
+        void *slab, *obj, *obj_base;
+        int i, size;
+
+        /* For large vmem_alloc'ed buffers it's important that we pack the
+         * spl_kmem_obj_t structures and the actual objects into one large
+         * virtual address zone to minimize the number of calls to
+         * vmalloc().  Mapping the virtual address range is done under a
+         * single global lock which walks a list of all virtual zones, so
+         * doing lots of allocations simply results in lock contention and a
+         * longer list of mapped addresses.  It is far better to do a
+         * few large allocations and then subdivide them ourselves.  The
+         * large vmem_alloc'ed space is divided up as follows:
+         *
+         *   1 slab struct:  sizeof(spl_kmem_slab_t)
+         *   N obj structs:  sizeof(spl_kmem_obj_t) * skc->skc_objs
+         *   N objects:      skc->skc_obj_size * skc->skc_objs
+         *
+         * XXX: It would probably be a good idea to more carefully
+         * align the starts of these objects in memory.
+         */
+        size = sizeof(spl_kmem_slab_t) + SPL_KMEM_CACHE_OBJ_PER_SLAB *
+               (skc->skc_obj_size + sizeof(spl_kmem_obj_t));
+
+        slab = vmem_alloc(size, flags);
+        if (slab == NULL)
+                RETURN(NULL);
+
+        sks = (spl_kmem_slab_t *)slab;
+        spl_slab_init(skc, sks);
+
+        sko_base = (spl_kmem_obj_t *)(slab + sizeof(spl_kmem_slab_t));
+        obj_base = (void *)sko_base + sizeof(spl_kmem_obj_t) * sks->sks_objs;
+
+        for (i = 0; i < sks->sks_objs; i++) {
+                sko = &sko_base[i];
+                obj = obj_base + skc->skc_obj_size * i;
+                sko->sko_addr = obj;
+                sko->sko_magic = SKO_MAGIC;
+                sko->sko_slab = sks;
+                INIT_LIST_HEAD(&sko->sko_list);
+                INIT_HLIST_NODE(&sko->sko_hlist);
+                list_add_tail(&sko->sko_list, &sks->sks_free_list);
+        }
+
+        RETURN(sks);
+}
+
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
+        spl_kmem_slab_t *sks;
+        spl_kmem_obj_t *sko;
+        int rc;
+        ENTRY;
+
+        /* Objects less than a page can use kmem_alloc() and avoid
+         * the locking overhead in __get_vm_area_node() when looking
+         * for a free address.  For objects over a page we use
+         * vmem_alloc() because it is usually worth paying this
+         * overhead to avoid the need to find contiguous pages.
+         * This should give us the best of both worlds.
+         */
+        if (skc->skc_obj_size <= PAGE_SIZE) {
+                sks = kmem_cache_alloc(spl_slab_cache, flags);
+                if (sks == NULL)
+                        GOTO(out, sks = NULL);
+
+                spl_slab_init(skc, sks);
+
+                rc = spl_slab_alloc_kmem(skc, sks, flags);
+                if (rc) {
+                        kmem_cache_free(spl_slab_cache, sks);
+                        GOTO(out, sks = NULL);
+                }
+        } else {
+                sks = spl_slab_alloc_vmem(skc, flags);
+                if (sks == NULL)
+                        GOTO(out, sks = NULL);
+        }
+
+        ASSERT(sks);
+        list_for_each_entry(sko, &sks->sks_free_list, sko_list)
+                if (skc->skc_ctor)
+                        skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
 out:
         RETURN(sks);
 }
 
+static void
+spl_slab_free_kmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+        spl_kmem_obj_t *sko, *n;
+
+        ASSERT(skc->skc_magic == SKC_MAGIC);
+        ASSERT(sks->sks_magic == SKS_MAGIC);
+
+        list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
+                ASSERT(sko->sko_magic == SKO_MAGIC);
+                kmem_free(sko->sko_addr, skc->skc_obj_size);
+                list_del(&sko->sko_list);
+                kmem_cache_free(spl_obj_cache, sko);
+        }
+
+        kmem_cache_free(spl_slab_cache, sks);
+}
+
+static void
+spl_slab_free_vmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+        ASSERT(skc->skc_magic == SKC_MAGIC);
+        ASSERT(sks->sks_magic == SKS_MAGIC);
+
+        vmem_free(sks, sizeof(spl_kmem_slab_t) + SPL_KMEM_CACHE_OBJ_PER_SLAB *
+                       (skc->skc_obj_size + sizeof(spl_kmem_obj_t)));
+}
+
 /* Removes slab from complete or partial list, so it must
  * be called with the 'skc->skc_lock' held.
- * */
+ */
 static void
 spl_slab_free(spl_kmem_slab_t *sks) {
         spl_kmem_cache_t *skc;
         spl_kmem_obj_t *sko, *n;
-        int i = 0;
         ENTRY;
 
         ASSERT(sks->sks_magic == SKS_MAGIC);
         ASSERT(sks->sks_ref == 0);
-        skc = sks->sks_cache;
-        skc->skc_obj_total -= sks->sks_objs;
-        skc->skc_slab_total--;
+        skc = sks->sks_cache;
+        ASSERT(skc->skc_magic == SKC_MAGIC);
         ASSERT(spin_is_locked(&skc->skc_lock));
 
-        list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
-                ASSERT(sko->sko_magic == SKO_MAGIC);
+        skc->skc_obj_total -= sks->sks_objs;
+        skc->skc_slab_total--;
+        list_del(&sks->sks_list);
 
-                /* Run destructors for being freed */
+        /* Run destructors as the slab is being released */
+        list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list)
                 if (skc->skc_dtor)
                         skc->skc_dtor(sko->sko_addr, skc->skc_private);
 
-                if (skc->skc_obj_size <= PAGE_SIZE)
-                        kmem_free(sko->sko_addr, skc->skc_obj_size);
-                else
-                        vmem_free(sko->sko_addr, skc->skc_obj_size);
-
-                list_del(&sko->sko_list);
-                kmem_cache_free(spl_obj_cache, sko);
-                i++;
-        }
-
-        ASSERT(sks->sks_objs == i);
-        list_del(&sks->sks_list);
-        kmem_cache_free(spl_slab_cache, sks);
+        if (skc->skc_obj_size <= PAGE_SIZE)
+                spl_slab_free_kmem(skc, sks);
+        else
+                spl_slab_free_vmem(skc, sks);
 
         EXIT;
 }
@@ -629,14 +731,13 @@ static spl_kmem_slab_t *
 spl_cache_grow(spl_kmem_cache_t *skc, int flags)
 {
         spl_kmem_slab_t *sks;
-        spl_kmem_obj_t *sko;
         cycles_t start;
         ENTRY;
 
         ASSERT(skc->skc_magic == SKC_MAGIC);
 
         if (flags & __GFP_WAIT) {
-//                flags |= __GFP_NOFAIL; /* XXX: Solaris assumes this */
+                flags |= __GFP_NOFAIL;
                 might_sleep();
                 local_irq_enable();
         }
@@ -649,14 +750,6 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags)
                 RETURN(NULL);
         }
 
-        /* Run all the constructors now that the slab is fully allocated */
-        list_for_each_entry(sko, &sks->sks_free_list, sko_list) {
-                ASSERT(sko->sko_magic == SKO_MAGIC);
-
-                if (skc->skc_ctor)
-                        skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
-        }
-
         if (flags & __GFP_WAIT)
                 local_irq_disable();
 
@@ -697,7 +790,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
                 if (list_empty(&skc->skc_partial_list)) {
                         spin_unlock(&skc->skc_lock);
 
-                        if (unlikely((get_cycles() - start) > skc->skc_lock_refill))
+                        if (unlikely((get_cycles()-start)>skc->skc_lock_refill))
                                 skc->skc_lock_refill = get_cycles() - start;
 
                         sks = spl_cache_grow(skc, flags);
@@ -861,6 +954,7 @@ restart:
         }
 
         local_irq_restore(irq_flags);
+        ASSERT(obj);
 
         /* Pre-emptively migrate object to CPU L1 cache */
         prefetchw(obj);
diff --git a/modules/splat/splat-kmem.c b/modules/splat/splat-kmem.c
index de9b36841..49715152d 100644
--- a/modules/splat/splat-kmem.c
+++ b/modules/splat/splat-kmem.c
@@ -559,36 +559,36 @@ splat_kmem_test8_count(kmem_cache_priv_t *kcp, int threads)
  * eyeball the slab cache locking overhead to ensure it is reasonable.
  */
 static int
-splat_kmem_test8(struct file *file, void *arg)
+splat_kmem_test8_sc(struct file *file, void *arg, int size, int count)
 {
         kmem_cache_priv_t kcp;
         kthread_t *thr;
         struct timespec start, stop, delta;
-        char cache_name[16];
-        int alloc, i;
+        char cache_name[32];
+        int i, j, threads = 32;
 
         kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
         kcp.kcp_file = file;
 
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%s",
+        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %s",
                      "name", "time (sec)\tslabs \tobjs \thash\n");
-        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%s",
+        splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %s",
                      "", " \ttot/max/calc\ttot/max/calc\tsize/depth\n");
 
-        for (alloc = 1; alloc <= 4096; alloc *= 2) {
-                kcp.kcp_size = 256;
+        for (i = 1; i <= count; i *= 2) {
+                kcp.kcp_size = size;
                 kcp.kcp_count = 0;
                 kcp.kcp_threads = 0;
-                kcp.kcp_alloc = alloc;
+                kcp.kcp_alloc = i;
                 kcp.kcp_rc = 0;
                 spin_lock_init(&kcp.kcp_lock);
                 init_waitqueue_head(&kcp.kcp_waitq);
 
-                sprintf(cache_name, "%s-%d", SPLAT_KMEM_CACHE_NAME, alloc);
+                sprintf(cache_name, "%s-%d-%d", SPLAT_KMEM_CACHE_NAME, size, i);
                 kcp.kcp_cache = kmem_cache_create(cache_name, kcp.kcp_size, 0,
-                                                  splat_kmem_cache_test_constructor,
-                                                  splat_kmem_cache_test_destructor,
-                                                  NULL, &kcp, NULL, 0);
+                                        splat_kmem_cache_test_constructor,
+                                        splat_kmem_cache_test_destructor,
+                                        NULL, &kcp, NULL, 0);
                 if (!kcp.kcp_cache) {
                         splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
                                      "Unable to create '%s' cache\n",
@@ -598,7 +598,7 @@ splat_kmem_test8(struct file *file, void *arg)
 
                 start = current_kernel_time();
 
-                for (i = 0; i < 32; i++) {
+                for (j = 0; j < threads; j++) {
                         thr = thread_create(NULL, 0, splat_kmem_test8_thread,
                                             &kcp, 0, &p0, TS_RUN, minclsyspri);
                         ASSERT(thr != NULL);
@@ -610,15 +610,17 @@ splat_kmem_test8(struct file *file, void *arg)
                 stop = current_kernel_time();
                 delta = timespec_sub(stop, start);
 
-                splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%2ld.%09ld\t"
+                splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %2ld.%09ld\t"
                              "%lu/%lu/%lu\t%lu/%lu/%lu\t%lu/%lu\n",
+                             kcp.kcp_cache->skc_name,
                              delta.tv_sec, delta.tv_nsec,
                              (unsigned long)kcp.kcp_cache->skc_slab_total,
                              (unsigned long)kcp.kcp_cache->skc_slab_max,
-                             (unsigned long)(kcp.kcp_alloc * 32 / SPL_KMEM_CACHE_OBJ_PER_SLAB),
+                             (unsigned long)(kcp.kcp_alloc * threads /
+                                             SPL_KMEM_CACHE_OBJ_PER_SLAB),
                              (unsigned long)kcp.kcp_cache->skc_obj_total,
                              (unsigned long)kcp.kcp_cache->skc_obj_max,
-                             (unsigned long)(kcp.kcp_alloc * 32),
+                             (unsigned long)(kcp.kcp_alloc * threads),
                              (unsigned long)kcp.kcp_cache->skc_hash_size,
                              (unsigned long)kcp.kcp_cache->skc_hash_depth);
 
@@ -631,6 +633,22 @@ splat_kmem_test8(struct file *file, void *arg)
         return kcp.kcp_rc;
 }
 
+static int
+splat_kmem_test8(struct file *file, void *arg)
+{
+        int i, rc = 0;
+
+        /* Run through the slab cache with object sizes from
+         * 16 bytes to 1MB in 4x multiples, with 1024 objects each */
+        for (i = 16; i <= 1024*1024; i *= 4) {
+                rc = splat_kmem_test8_sc(file, arg, i, 1024);
+                if (rc)
+                        break;
+        }
+
+        return rc;
+}
+
 splat_subsystem_t *
 splat_kmem_init(void)
 {
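The core idea of the patch is that one large vmem_alloc() now covers the slab descriptor, the object descriptors, and the objects, so the vmalloc zone lock is taken once per slab instead of once per object.  The following is a minimal userspace sketch of that layout arithmetic, under stated assumptions: fake_slab, fake_obj, OBJ_PER_SLAB, the 128K object size, and malloc() are simplified stand-ins for spl_kmem_slab_t, spl_kmem_obj_t, SPL_KMEM_CACHE_OBJ_PER_SLAB, skc_obj_size, and vmem_alloc(); only the offset math mirrors spl_slab_alloc_vmem() above.

#include <stdio.h>
#include <stdlib.h>

#define OBJ_PER_SLAB 32                /* stand-in for SPL_KMEM_CACHE_OBJ_PER_SLAB */

struct fake_slab { long sks_magic; };  /* stand-in for spl_kmem_slab_t */
struct fake_obj  { void *sko_addr; };  /* stand-in for spl_kmem_obj_t */

int main(void)
{
        size_t obj_size = 128 * 1024;  /* example: 128K objects, well over PAGE_SIZE */

        /* Same arithmetic as spl_slab_alloc_vmem():
         * 1 slab struct + N obj structs + N objects, in one allocation. */
        size_t size = sizeof(struct fake_slab) +
                      OBJ_PER_SLAB * (obj_size + sizeof(struct fake_obj));

        char *slab = malloc(size);     /* vmem_alloc() in the kernel code */
        if (slab == NULL)
                return 1;

        /* Carve the block up: slab descriptor first, then the object
         * descriptors, then the objects themselves. */
        struct fake_obj *sko_base = (struct fake_obj *)(slab + sizeof(struct fake_slab));
        char *obj_base = (char *)sko_base + OBJ_PER_SLAB * sizeof(struct fake_obj);

        for (int i = 0; i < OBJ_PER_SLAB; i++)
                sko_base[i].sko_addr = obj_base + (size_t)i * obj_size;

        printf("one allocation of %zu bytes backs %d objects of %zu bytes\n",
               size, OBJ_PER_SLAB, obj_size);
        free(slab);
        return 0;
}

With 32 objects of 128K, a single allocation of a little over 4MB backs the whole slab, which matches the "roughly 32*spl_obj_size" figure in the commit message.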
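As a usage note, a cache only takes the new vmem-backed slab path when its object size is larger than PAGE_SIZE; smaller objects keep using kmem_alloc()-backed slabs.  The fragment below is a hypothetical kernel-module sketch of exercising that path, modeled on the splat_kmem_test8_sc() call in this patch.  my_ctor(), my_dtor(), my_cache_example(), and the 64K object size are invented for illustration, and the Solaris-style kmem_cache_t handle, kmem_cache_alloc()/kmem_cache_free()/kmem_cache_destroy() entry points, and KM_SLEEP flag are assumed to be the ones provided by the SPL's sys/kmem.h rather than anything confirmed by this patch.

#include <sys/kmem.h>   /* SPL Solaris-emulation API (assumed) */

#define MY_OBJ_SIZE     (64 * 1024)    /* > PAGE_SIZE, so slabs come from vmem_alloc() */

/* Hypothetical constructor/destructor with the call shapes used above:
 * skc_ctor(sko_addr, skc_private, flags) and skc_dtor(sko_addr, skc_private). */
static int
my_ctor(void *buf, void *priv, int flags)
{
        /* per-object setup would go here */
        return 0;
}

static void
my_dtor(void *buf, void *priv)
{
        /* nothing to tear down in this sketch */
}

static int
my_cache_example(void)
{
        kmem_cache_t *cache;
        void *obj;

        /* Same 9-argument create call used by splat_kmem_test8_sc(). */
        cache = kmem_cache_create("my-64k-cache", MY_OBJ_SIZE, 0,
                                  my_ctor, my_dtor, NULL, NULL, NULL, 0);
        if (cache == NULL)
                return -ENOMEM;

        obj = kmem_cache_alloc(cache, KM_SLEEP);  /* first alloc grows a vmem-backed slab */
        if (obj != NULL)
                kmem_cache_free(cache, obj);

        kmem_cache_destroy(cache);
        return 0;
}

The first kmem_cache_alloc() forces spl_cache_grow() to build a slab, which for this object size comes from spl_slab_alloc_vmem() as a single large allocation rather than 32 separate vmem_alloc() calls.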