diff --git a/config/spl-build.m4 b/config/spl-build.m4 index 08b84efe9..eef52334f 100644 --- a/config/spl-build.m4 +++ b/config/spl-build.m4 @@ -93,6 +93,7 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [ SPL_AC_SCHED_RT_HEADER SPL_AC_2ARGS_VFS_GETATTR SPL_AC_USLEEP_RANGE + SPL_AC_KMEM_CACHE_ALLOCFLAGS ]) AC_DEFUN([SPL_AC_MODULE_SYMVERS], [ @@ -2532,3 +2533,40 @@ AC_DEFUN([SPL_AC_USLEEP_RANGE], [ AC_MSG_RESULT(no) ]) ]) + +dnl # +dnl # 2.6.35 API change, +dnl # The cachep->gfpflags member was renamed cachep->allocflags. These are +dnl # private allocation flags which are applied when allocating a new slab +dnl # in kmem_getpages(). Unfortunately there is no public API for setting +dnl # non-default flags. +dnl # +AC_DEFUN([SPL_AC_KMEM_CACHE_ALLOCFLAGS], [ + AC_MSG_CHECKING([whether struct kmem_cache has allocflags]) + SPL_LINUX_TRY_COMPILE([ + #include + ],[ + struct kmem_cache cachep __attribute__ ((unused)); + cachep.allocflags = GFP_KERNEL; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KMEM_CACHE_ALLOCFLAGS, 1, + [struct kmem_cache has allocflags]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether struct kmem_cache has gfpflags]) + SPL_LINUX_TRY_COMPILE([ + #include + ],[ + struct kmem_cache cachep __attribute__ ((unused)); + cachep.gfpflags = GFP_KERNEL; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KMEM_CACHE_GFPFLAGS, 1, + [struct kmem_cache has gfpflags]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) +]) diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 516114fd7..18533fe39 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -340,8 +340,9 @@ enum { KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ KMC_BIT_KMEM = 5, /* Use kmem cache */ KMC_BIT_VMEM = 6, /* Use vmem cache */ - KMC_BIT_OFFSLAB = 7, /* Objects not on slab */ - KMC_BIT_NOEMERGENCY = 8, /* Disable emergency objects */ + KMC_BIT_SLAB = 7, /* Use Linux slab cache */ + KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ + KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ KMC_BIT_GROWING = 15, /* Growing in progress */ KMC_BIT_REAPING = 16, /* Reaping in progress */ @@ -367,6 +368,7 @@ typedef enum kmem_cbrc { #define KMC_QCACHE (1 << KMC_BIT_QCACHE) #define KMC_KMEM (1 << KMC_BIT_KMEM) #define KMC_VMEM (1 << KMC_BIT_VMEM) +#define KMC_SLAB (1 << KMC_BIT_SLAB) #define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) #define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) #define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) @@ -456,6 +458,7 @@ typedef struct spl_kmem_cache { spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ void *skc_private; /* Private data */ void *skc_vmp; /* Unused */ + struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ unsigned long skc_flags; /* Flags */ uint32_t skc_obj_size; /* Object size */ uint32_t skc_obj_align; /* Object alignment */ @@ -513,4 +516,24 @@ void spl_kmem_fini(void); #define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ ((ptr) < (void *)VMALLOC_END)) +/* + * Allow custom slab allocation flags to be set for KMC_SLAB based caches. + * One use for this function is to ensure the __GFP_COMP flag is part of + * the default allocation mask which ensures higher order allocations are + * properly refcounted. This flag was added to the default ->allocflags + * as of Linux 3.11. + */ +static inline void +kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags) +{ + if (skc->skc_linux_cache == NULL) + return; + +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= flags; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= flags; +#endif +} + #endif /* _SPL_KMEM_H */ diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index b673c2915..fc04604d5 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -33,6 +33,16 @@ #define SS_DEBUG_SUBSYS SS_KMEM +/* + * Within the scope of spl-kmem.c file the kmem_cache_* definitions + * are removed to allow access to the real Linux slab allocator. + */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + /* * Cache expiration was implemented because it was part of the default Solaris * kmem_cache behavior. The idea is that per-cpu objects which haven't been @@ -60,6 +70,16 @@ unsigned int spl_kmem_cache_max_size = 32; module_param(spl_kmem_cache_max_size, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); +unsigned int spl_kmem_cache_slab_limit = 0; +module_param(spl_kmem_cache_slab_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_slab_limit, + "Objects less than N bytes use the Linux slab"); + +unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4); +module_param(spl_kmem_cache_kmem_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, + "Objects less than N bytes use the kmalloc"); + /* * The minimum amount of memory measured in pages to be free at all * times on the system. This is similar to Linux's zone->pages_min @@ -1348,7 +1368,10 @@ spl_cache_age(void *data) return; atomic_inc(&skc->skc_ref); - spl_on_each_cpu(spl_magazine_age, skc, 1); + + if (!(skc->skc_flags & KMC_NOMAGAZINE)) + spl_on_each_cpu(spl_magazine_age, skc, 1); + spl_slab_reclaim(skc, skc->skc_reap, 0); while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { @@ -1493,6 +1516,9 @@ spl_magazine_create(spl_kmem_cache_t *skc) int i; SENTRY; + if (skc->skc_flags & KMC_NOMAGAZINE) + SRETURN(0); + skc->skc_mag_size = spl_magazine_size(skc); skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; @@ -1519,6 +1545,11 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) int i; SENTRY; + if (skc->skc_flags & KMC_NOMAGAZINE) { + SEXIT; + return; + } + for_each_online_cpu(i) { skm = skc->skc_mag[i]; spl_cache_flush(skc, skm, skm->skm_avail); @@ -1541,11 +1572,12 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) * flags * KMC_NOTOUCH Disable cache object aging (unsupported) * KMC_NODEBUG Disable debugging (unsupported) - * KMC_NOMAGAZINE Disable magazine (unsupported) * KMC_NOHASH Disable hashing (unsupported) * KMC_QCACHE Disable qcache (unsupported) + * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab * KMC_KMEM Force kmem backed cache * KMC_VMEM Force vmem backed cache + * KMC_SLAB Force Linux slab backed cache * KMC_OFFSLAB Locate objects off the slab */ spl_kmem_cache_t * @@ -1591,6 +1623,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, skc->skc_reclaim = reclaim; skc->skc_private = priv; skc->skc_vmp = vmp; + skc->skc_linux_cache = NULL; skc->skc_flags = flags; skc->skc_obj_size = size; skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; @@ -1617,28 +1650,69 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, skc->skc_obj_emergency = 0; skc->skc_obj_emergency_max = 0; + /* + * Verify the requested alignment restriction is sane. + */ if (align) { VERIFY(ISP2(align)); - VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); /* Min alignment */ - VERIFY3U(align, <=, PAGE_SIZE); /* Max alignment */ + VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); + VERIFY3U(align, <=, PAGE_SIZE); skc->skc_obj_align = align; } - /* If none passed select a cache type based on object size */ - if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) { - if (spl_obj_size(skc) < (PAGE_SIZE / 8)) + /* + * When no specific type of slab is requested (kmem, vmem, or + * linuxslab) then select a cache type based on the object size + * and default tunables. + */ + if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { + + /* + * Objects smaller than spl_kmem_cache_slab_limit can + * use the Linux slab for better space-efficiency. By + * default this functionality is disabled until its + * performance characters are fully understood. + */ + if (spl_kmem_cache_slab_limit && + size <= (size_t)spl_kmem_cache_slab_limit) + skc->skc_flags |= KMC_SLAB; + + /* + * Small objects, less than spl_kmem_cache_kmem_limit per + * object should use kmem because their slabs are small. + */ + else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) skc->skc_flags |= KMC_KMEM; + + /* + * All other objects are considered large and are placed + * on vmem backed slabs. + */ else skc->skc_flags |= KMC_VMEM; } - rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size); - if (rc) - SGOTO(out, rc); + /* + * Given the type of slab allocate the required resources. + */ + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + rc = spl_slab_size(skc, + &skc->skc_slab_objs, &skc->skc_slab_size); + if (rc) + SGOTO(out, rc); - rc = spl_magazine_create(skc); - if (rc) - SGOTO(out, rc); + rc = spl_magazine_create(skc); + if (rc) + SGOTO(out, rc); + } else { + skc->skc_linux_cache = kmem_cache_create( + skc->skc_name, size, align, 0, NULL); + if (skc->skc_linux_cache == NULL) + SGOTO(out, rc = ENOMEM); + + kmem_cache_set_allocflags(skc, __GFP_COMP); + skc->skc_flags |= KMC_NOMAGAZINE; + } if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, @@ -1680,6 +1754,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) SENTRY; ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); down_write(&spl_kmem_cache_sem); list_del_init(&skc->skc_list); @@ -1699,8 +1774,14 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) * cache reaping action which races with this destroy. */ wait_event(wq, atomic_read(&skc->skc_ref) == 0); - spl_magazine_destroy(skc); - spl_slab_reclaim(skc, 0, 1); + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + spl_magazine_destroy(skc); + spl_slab_reclaim(skc, 0, 1); + } else { + ASSERT(skc->skc_flags & KMC_SLAB); + kmem_cache_destroy(skc->skc_linux_cache); + } + spin_lock(&skc->skc_lock); /* Validate there are no objects in use and free all the @@ -1806,7 +1887,9 @@ spl_cache_reclaim_wait(void *word) } /* - * No available objects on any slabs, create a new slab. + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. */ static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) @@ -1815,6 +1898,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) SENTRY; ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT((skc->skc_flags & KMC_SLAB) == 0); might_sleep(); *obj = NULL; @@ -2016,7 +2100,28 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); ASSERT(flags & KM_SLEEP); + atomic_inc(&skc->skc_ref); + + /* + * Allocate directly from a Linux slab. All optimizations are left + * to the underlying cache we only need to guarantee that KM_SLEEP + * callers will never fail. + */ + if (skc->skc_flags & KMC_SLAB) { + struct kmem_cache *slc = skc->skc_linux_cache; + + do { + obj = kmem_cache_alloc(slc, flags | __GFP_COMP); + if (obj && skc->skc_ctor) + skc->skc_ctor(obj, skc->skc_private, flags); + + } while ((obj == NULL) && !(flags & KM_NOSLEEP)); + + atomic_dec(&skc->skc_ref); + SRETURN(obj); + } + local_irq_disable(); restart: @@ -2068,6 +2173,17 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); atomic_inc(&skc->skc_ref); + /* + * Free the object from the Linux underlying Linux slab. + */ + if (skc->skc_flags & KMC_SLAB) { + if (skc->skc_dtor) + skc->skc_dtor(obj, skc->skc_private); + + kmem_cache_free(skc->skc_linux_cache, obj); + goto out; + } + /* * Only virtual slabs may have emergency objects and these objects * are guaranteed to have physical addresses. They must be removed @@ -2166,13 +2282,27 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - /* Prevent concurrent cache reaping when contended */ - if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) { - SEXIT; - return; + atomic_inc(&skc->skc_ref); + + /* + * Execute the registered reclaim callback if it exists. The + * per-cpu caches will be drained when is set KMC_EXPIRE_MEM. + */ + if (skc->skc_flags & KMC_SLAB) { + if (skc->skc_reclaim) + skc->skc_reclaim(skc->skc_private); + + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) + kmem_cache_shrink(skc->skc_linux_cache); + + SGOTO(out, 0); } - atomic_inc(&skc->skc_ref); + /* + * Prevent concurrent cache reaping when contended. + */ + if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) + SGOTO(out, 0); /* * When a reclaim function is available it may be invoked repeatedly @@ -2222,7 +2352,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) clear_bit(KMC_BIT_REAPING, &skc->skc_flags); smp_mb__after_clear_bit(); wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); - +out: atomic_dec(&skc->skc_ref); SEXIT; diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c index 2bbc8d790..b4be84fef 100644 --- a/module/spl/spl-proc.c +++ b/module/spl/spl-proc.c @@ -646,6 +646,12 @@ slab_seq_show(struct seq_file *f, void *p) ASSERT(skc->skc_magic == SKC_MAGIC); + /* + * Backed by Linux slab see /proc/slabinfo. + */ + if (skc->skc_flags & KMC_SLAB) + return (0); + spin_lock(&skc->skc_lock); seq_printf(f, "%-36s ", skc->skc_name); seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c index 25a52b43d..4d060c138 100644 --- a/module/splat/splat-kmem.c +++ b/module/splat/splat-kmem.c @@ -394,18 +394,25 @@ splat_kmem_cache_test_debug(struct file *file, char *name, { int j; - splat_vprint(file, name, - "%s cache objects %d, slabs %u/%u objs %u/%u mags ", - kcp->kcp_cache->skc_name, kcp->kcp_count, + splat_vprint(file, name, "%s cache objects %d", + kcp->kcp_cache->skc_name, kcp->kcp_count); + + if (kcp->kcp_cache->skc_flags & (KMC_KMEM | KMC_VMEM)) { + splat_vprint(file, name, ", slabs %u/%u objs %u/%u", (unsigned)kcp->kcp_cache->skc_slab_alloc, (unsigned)kcp->kcp_cache->skc_slab_total, (unsigned)kcp->kcp_cache->skc_obj_alloc, (unsigned)kcp->kcp_cache->skc_obj_total); - for_each_online_cpu(j) - splat_print(file, "%u/%u ", - kcp->kcp_cache->skc_mag[j]->skm_avail, - kcp->kcp_cache->skc_mag[j]->skm_size); + if (!(kcp->kcp_cache->skc_flags & KMC_NOMAGAZINE)) { + splat_vprint(file, name, "%s", "mags"); + + for_each_online_cpu(j) + splat_print(file, "%u/%u ", + kcp->kcp_cache->skc_mag[j]->skm_avail, + kcp->kcp_cache->skc_mag[j]->skm_size); + } + } splat_print(file, "%s\n", ""); } @@ -900,14 +907,14 @@ splat_kmem_test8(struct file *file, void *arg) kmem_cache_reap_now(kcp->kcp_cache); splat_kmem_cache_test_debug(file, SPLAT_KMEM_TEST8_NAME, kcp); - if (kcp->kcp_cache->skc_obj_total == 0) + if (kcp->kcp_count == 0) break; set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ / 10); } - if (kcp->kcp_cache->skc_obj_total == 0) { + if (kcp->kcp_count == 0) { splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "Successfully created %d objects " "in cache %s and reclaimed them\n", @@ -915,7 +922,7 @@ splat_kmem_test8(struct file *file, void *arg) } else { splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "Failed to reclaim %u/%d objects from cache %s\n", - (unsigned)kcp->kcp_cache->skc_obj_total, + (unsigned)kcp->kcp_count, SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME); rc = -ENOMEM; } @@ -995,14 +1002,14 @@ splat_kmem_test9(struct file *file, void *arg) for (i = 0; i < 60; i++) { splat_kmem_cache_test_debug(file, SPLAT_KMEM_TEST9_NAME, kcp); - if (kcp->kcp_cache->skc_obj_total == 0) + if (kcp->kcp_count == 0) break; set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); } - if (kcp->kcp_cache->skc_obj_total == 0) { + if (kcp->kcp_count == 0) { splat_vprint(file, SPLAT_KMEM_TEST9_NAME, "Successfully created %d objects " "in cache %s and reclaimed them\n", @@ -1010,7 +1017,7 @@ splat_kmem_test9(struct file *file, void *arg) } else { splat_vprint(file, SPLAT_KMEM_TEST9_NAME, "Failed to reclaim %u/%d objects from cache %s\n", - (unsigned)kcp->kcp_cache->skc_obj_total, count, + (unsigned)kcp->kcp_count, count, SPLAT_KMEM_CACHE_NAME); rc = -ENOMEM; }