diff --git a/include/sys/kmem.h b/include/sys/kmem.h index a9d94c909..045d07c2c 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -26,6 +26,7 @@ #define _SPL_KMEM_H #include +#include extern int kmem_debugging(void); extern char *kmem_vasprintf(const char *fmt, va_list ap); @@ -36,68 +37,41 @@ extern void strfree(char *str); /* * Memory allocation interfaces */ -#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */ -#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */ -#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */ -#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */ -#define KM_FLAGS __GFP_BITS_MASK -#define KM_VMFLAGS GFP_LEVEL_MASK +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_ZERO 0x1000 /* zero the allocation */ +#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */ + +#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) /* - * Used internally, the kernel does not need to support this flag + * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion + * function is context aware which means that KM_SLEEP allocations can be + * safely used in syncing contexts which have set PF_FSTRANS. */ -#ifndef __GFP_ZERO -#define __GFP_ZERO 0x8000 -#endif - -/* - * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as - * early as 2.6.32. To avoid this issue when it occurs in upstream kernels - * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC). - * I would prefer the caller handle the failure case cleanly but we are - * trying to emulate Solaris and those are not the Solaris semantics. - */ -static inline void * -kmalloc_nofail(size_t size, gfp_t flags) +static inline gfp_t +kmem_flags_convert(int flags) { - void *ptr; + gfp_t lflags = __GFP_NOWARN | __GFP_COMP; - do { - ptr = kmalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC | __GFP_NORETRY; + } else { + lflags |= GFP_KERNEL; + if ((current->flags & PF_FSTRANS)) + lflags &= ~(__GFP_IO|__GFP_FS); + } - return (ptr); + if (flags & KM_PUSHPAGE) + lflags |= __GFP_HIGH; + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + + return (lflags); } -static inline void * -kzalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - do { - ptr = kzalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); - - return (ptr); -} - -static inline void * -kmalloc_node_nofail(size_t size, gfp_t flags, int node) -{ - void *ptr; - - do { - ptr = kmalloc_node(size, flags, node); - } while (ptr == NULL && (flags & __GFP_WAIT)); - - return (ptr); -} - -#ifdef DEBUG_KMEM - -/* - * Memory accounting functions to be used only when DEBUG_KMEM is set. - */ #ifdef HAVE_ATOMIC64_T #define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) #define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) @@ -114,70 +88,29 @@ extern atomic_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; #endif /* HAVE_ATOMIC64_T */ -#ifdef DEBUG_KMEM_TRACKING +extern unsigned int spl_kmem_alloc_warn; +extern unsigned int spl_kmem_alloc_max; + +#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) +#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__) +#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) + +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_kmem_free(const void *ptr, size_t sz); + /* - * DEBUG_KMEM && DEBUG_KMEM_TRACKING - * - * The maximum level of memory debugging. All memory will be accounted - * for and each allocation will be explicitly tracked. Any allocation - * which is leaked will be reported on module unload and the exact location - * where that memory was allocation will be reported. This level of memory - * tracking will have a significant impact on performance and should only - * be enabled for debugging. This feature may be enabled by passing - * --enable-debug-kmem-tracking to configure. + * The following functions are only available for internal use. */ -#define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -#define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) +extern void *spl_kmem_alloc_impl(size_t size, int flags, int node); +extern void *spl_kmem_alloc_debug(size_t size, int flags, int node); +extern void *spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node); +extern void spl_kmem_free_impl(const void *buf, size_t size); +extern void spl_kmem_free_debug(const void *buf, size_t size); +extern void spl_kmem_free_track(const void *buf, size_t size); -extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); -extern void kmem_free_track(const void *, size_t); - -#else /* DEBUG_KMEM_TRACKING */ -/* - * DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * The default build will set DEBUG_KEM. This provides basic memory - * accounting with little to no impact on performance. When the module - * is unloaded in any memory was leaked the total number of leaked bytes - * will be reported on the console. To disable this basic accounting - * pass the --disable-debug-kmem option to configure. - */ -#define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -#define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -#define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) - -extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int); -extern void kmem_free_debug(const void *, size_t); - -#endif /* DEBUG_KMEM_TRACKING */ -#else /* DEBUG_KMEM */ -/* - * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * All debugging is disabled. There will be no overhead even for - * minimal memory accounting. To enable basic accounting pass the - * --enable-debug-kmem option to configure. - */ -#define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl)) -#define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl)) -#define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) -#define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) - -#endif /* DEBUG_KMEM */ - -int spl_kmem_init(void); -void spl_kmem_fini(void); - -#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) +extern int spl_kmem_init(void); +extern void spl_kmem_fini(void); #endif /* _SPL_KMEM_H */ diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h index a5bc0322b..a9b5bdd2f 100644 --- a/include/sys/kmem_cache.h +++ b/include/sys/kmem_cache.h @@ -202,6 +202,7 @@ extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); extern void spl_kmem_reap(void); @@ -214,29 +215,6 @@ extern void spl_kmem_reap(void); #define kmem_cache_reap_now(skc) \ spl_kmem_cache_reap_now(skc, skc->skc_reap) #define kmem_reap() spl_kmem_reap() -#define kmem_virt(ptr) \ - (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) - -/* - * Allow custom slab allocation flags to be set for KMC_SLAB based caches. - * One use for this function is to ensure the __GFP_COMP flag is part of - * the default allocation mask which ensures higher order allocations are - * properly refcounted. This flag was added to the default ->allocflags - * as of Linux 3.11. - */ -static inline void -kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags) -{ - if (skc->skc_linux_cache == NULL) - return; - -#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) - skc->skc_linux_cache->allocflags |= flags; -#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) - skc->skc_linux_cache->gfpflags |= flags; -#endif -} /* * The following functions are only available for internal use. diff --git a/include/sys/vmem.h b/include/sys/vmem.h index f59ac5e8b..6eb2c6769 100644 --- a/include/sys/vmem.h +++ b/include/sys/vmem.h @@ -47,135 +47,60 @@ extern size_t vmem_size(vmem_t *vmp, int typemask); #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) #endif -static inline void * -vmalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - /* - * Retry failed __vmalloc() allocations once every second. The - * rational for the delay is that the likely failure modes are: - * - * 1) The system has completely exhausted memory, in which case - * delaying 1 second for the memory reclaim to run is reasonable - * to avoid thrashing the system. - * 2) The system has memory but has exhausted the small virtual - * address space available on 32-bit systems. Retrying the - * allocation immediately will only result in spinning on the - * virtual address space lock. It is better delay a second and - * hope that another process will free some of the address space. - * But the bottom line is there is not much we can actually do - * since we can never safely return a failure and honor the - * Solaris semantics. - */ - while (1) { - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); - if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } else { - break; - } - } - - return (ptr); -} - -static inline void * -vzalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - ptr = vmalloc_nofail(size, flags); - if (ptr) - memset(ptr, 0, (size)); - - return (ptr); -} - -#ifdef DEBUG_KMEM - /* - * Memory accounting functions to be used only when DEBUG_KMEM is set. - */ -#ifdef HAVE_ATOMIC64_T - -#define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) -#define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) -#define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) -#define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) - -extern atomic64_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -#else /* HAVE_ATOMIC64_T */ - -#define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) -#define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) -#define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) -#define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) - -extern atomic_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -#endif /* HAVE_ATOMIC64_T */ - -#ifdef DEBUG_KMEM_TRACKING -/* - * DEBUG_KMEM && DEBUG_KMEM_TRACKING + * vmem_* is an interface to a low level arena-based memory allocator on + * Illumos that is used to allocate virtual address space. The kmem SLAB + * allocator allocates slabs from it. Then the generic allocation functions + * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators. * - * The maximum level of memory debugging. All memory will be accounted - * for and each allocation will be explicitly tracked. Any allocation - * which is leaked will be reported on module unload and the exact location - * where that memory was allocation will be reported. This level of memory - * tracking will have a significant impact on performance and should only - * be enabled for debugging. This feature may be enabled by passing - * --enable-debug-kmem-tracking to configure. - */ -#define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__) -#define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -#define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) - -extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); -extern void kmem_free_track(const void *, size_t); -extern void *vmem_alloc_track(size_t, int, const char *, int); -extern void vmem_free_track(const void *, size_t); - -#else /* DEBUG_KMEM_TRACKING */ -/* - * DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * On Linux, the primary means of doing allocations is via kmalloc(), which + * is similarly layered on top of something called the buddy allocator. The + * buddy allocator is not available to kernel modules, it uses physical + * memory addresses rather than virtual memory addresses and is prone to + * fragmentation. * - * The default build will set DEBUG_KEM. This provides basic memory - * accounting with little to no impact on performance. When the module - * is unloaded in any memory was leaked the total number of leaked bytes - * will be reported on the console. To disable this basic accounting - * pass the --disable-debug-kmem option to configure. - */ -#define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__) -#define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -#define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) - -extern void *vmem_alloc_debug(size_t, int, const char *, int); -extern void vmem_free_debug(const void *, size_t); - -#endif /* DEBUG_KMEM_TRACKING */ -#else /* DEBUG_KMEM */ -/* - * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING + * Linux sets aside a relatively small address space for in-kernel virtual + * memory from which allocations can be done using vmalloc(). It might seem + * like a good idea to use vmalloc() to implement something similar to + * Illumos' allocator. However, this has the following problems: * - * All debugging is disabled. There will be no overhead even for - * minimal memory accounting. To enable basic accounting pass the - * --enable-debug-kmem option to configure. + * 1. Page directory table allocations are hard coded to use GFP_KERNEL. + * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using + * vmalloc() will not have proper semantics. + * + * 2. Address space exhaustion is a real issue on 32-bit platforms where + * only a few 100MB are available. The kernel will handle it by spinning + * when it runs out of address space. + * + * 3. All vmalloc() allocations and frees are protected by a single global + * lock which serializes all allocations. + * + * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire + * list. The former will sum the allocations while the latter will print + * them to user space in a way that user space can keep the lock held + * indefinitely. When the total number of mapped allocations is large + * (several 100,000) a large amount of time will be spent waiting on locks. + * + * 5. Linux has a wait_on_bit() locking primitive that assumes physical + * memory is used, it simply does not work on virtual memory. Certain + * Linux structures (e.g. the superblock) use them and might be embedded + * into a structure from Illumos. This makes using Linux virtual memory + * unsafe in certain situations. + * + * It follows that we cannot obtain identical semantics to those on Illumos. + * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in + * such a way that they can be used as drop-in replacements for small vmem_* + * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}() + * to them. */ -#define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) -#define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) -#define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) -#endif /* DEBUG_KMEM */ +#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__) +#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) +#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) + +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_vmem_free(const void *ptr, size_t sz); int spl_vmem_init(void); void spl_vmem_fini(void); diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 index 33e10b53c..2ec5b668e 100644 --- a/man/man5/spl-module-parameters.5 +++ b/man/man5/spl-module-parameters.5 @@ -80,6 +80,46 @@ By age (0x1) or low memory (0x2) Default value: \fB0\fR. .RE +.sp +.ne 2 +.na +\fBspl_kmem_alloc_warn\fR (uint) +.ad +.RS 12n +As a general rule kmem_alloc() allocations should be small, preferably +just a few pages since they must by physically contiguous. Therefore, a +rate limited warning will be printed to the console for any kmem_alloc() +which exceeds a reasonable threshold. + +The default warning threshold is set to eight pages but capped at 32K to +accommodate systems using large pages. This value was selected to be small +enough to ensure the largest allocations are quickly noticed and fixed. +But large enough to avoid logging any warnings when a allocation size is +larger than optimal but not a serious concern. Since this value is tunable, +developers are encouraged to set it lower when testing so any new largish +allocations are quickly caught. These warnings may be disabled by setting +the threshold to zero. +.sp +Default value: \fB32K\fR. +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_alloc_max\fR (uint) +.ad +.RS 12n +Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. +Allocations which are marginally smaller than this limit may succeed but +should still be avoided due to the expense of locating a contiguous range +of free pages. Therefore, a maximum kmem size with reasonable safely +margin of 4x is set. Kmem_alloc() allocations larger than this maximum +will quickly fail. Vmem_alloc() allocations less than or equal to this +value will use kmalloc(), but shift to vmalloc() when exceeding this value. +.sp +Default value: \fBKMALLOC_MAX_SIZE/4\fR. +.RE + .sp .ne 2 .na diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 3aa65a9bf..9a8ccfe42 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -130,19 +130,6 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, * One serious concern I do have about this method is the relatively * small virtual address space on 32bit arches. This will seriously * constrain the size of the slab caches and their performance. - * - * XXX: Improve the partial slab list by carefully maintaining a - * strict ordering of fullest to emptiest slabs based on - * the slab reference count. This guarantees that when freeing - * slabs back to the system we need only linearly traverse the - * last N slabs in the list to discover all the freeable slabs. - * - * XXX: NUMA awareness for optionally allocating memory close to a - * particular core. This can be advantageous if you know the slab - * object will be short lived and primarily accessed from one core. - * - * XXX: Slab coloring may also yield performance improvements and would - * be desirable to implement. */ struct list_head spl_kmem_cache_list; /* List of caches */ @@ -158,15 +145,15 @@ SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { + gfp_t lflags = kmem_flags_convert(flags); void *ptr; ASSERT(ISP2(size)); if (skc->skc_flags & KMC_KMEM) - ptr = (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); + ptr = (void *)__get_free_pages(lflags, get_order(size)); else - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); @@ -361,12 +348,11 @@ spl_slab_free(spl_kmem_slab_t *sks, } /* - * Traverse all the partial slabs attached to a cache and free those - * which which are currently empty, and have not been touched for - * skc_delay seconds to avoid thrashing. The count argument is - * passed to optionally cap the number of slabs reclaimed, a count - * of zero means try and reclaim everything. When flag is set we - * always free an available slab regardless of age. + * Traverse all the partial slabs attached to a cache and free those which + * are currently empty, and have not been touched for skc_delay seconds to + * avoid thrashing. The count argument is passed to optionally cap the + * number of slabs reclaimed, a count of zero means try and reclaim + * everything. When flag the is set available slabs freed regardless of age. */ static void spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) @@ -480,6 +466,7 @@ spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) static int spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) { + gfp_t lflags = kmem_flags_convert(flags); spl_kmem_emergency_t *ske; int empty; @@ -490,11 +477,11 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) if (!empty) return (-EEXIST); - ske = kmalloc(sizeof (*ske), flags); + ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); - ske->ske_obj = kmalloc(skc->skc_obj_size, flags); + ske->ske_obj = kmalloc(skc->skc_obj_size, lflags); if (ske->ske_obj == NULL) { kfree(ske); return (-ENOMEM); @@ -734,7 +721,7 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) int size = sizeof (spl_kmem_magazine_t) + sizeof (void *) * skc->skc_mag_size; - skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); + skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (skm) { skm->skm_magic = SKM_MAGIC; skm->skm_avail = 0; @@ -754,13 +741,9 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) static void spl_magazine_free(spl_kmem_magazine_t *skm) { - int size = sizeof (spl_kmem_magazine_t) + - sizeof (void *) * skm->skm_size; - ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); - - kmem_free(skm, size); + kfree(skm); } /* @@ -835,6 +818,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags) { + gfp_t lflags = kmem_flags_convert(KM_SLEEP); spl_kmem_cache_t *skc; int rc; @@ -852,18 +836,17 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, * Allocate memory for a new cache and initialize it. Unfortunately, * this usually ends up being a large allocation of ~32k because * we need to allocate enough memory for the worst case number of - * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we - * explicitly pass KM_NODEBUG to suppress the kmem warning + * cpus in the magazine, skc_mag[NR_CPUS]. */ - skc = kmem_zalloc(sizeof (*skc), KM_SLEEP| KM_NODEBUG); + skc = kzalloc(sizeof (*skc), lflags); if (skc == NULL) return (NULL); skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP); + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { - kmem_free(skc, sizeof (*skc)); + kfree(skc); return (NULL); } strncpy(skc->skc_name, name, skc->skc_name_size); @@ -962,7 +945,11 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, goto out; } - kmem_cache_set_allocflags(skc, __GFP_COMP); +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= __GFP_COMP; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= __GFP_COMP; +#endif skc->skc_flags |= KMC_NOMAGAZINE; } @@ -977,8 +964,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, return (skc); out: - kmem_free(skc->skc_name, skc->skc_name_size); - kmem_free(skc, sizeof (*skc)); + kfree(skc->skc_name); + kfree(skc); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); @@ -1048,10 +1035,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); - kmem_free(skc->skc_name, skc->skc_name_size); spin_unlock(&skc->skc_lock); - kmem_free(skc, sizeof (*skc)); + kfree(skc->skc_name); + kfree(skc); } EXPORT_SYMBOL(spl_kmem_cache_destroy); @@ -1106,7 +1093,13 @@ spl_cache_grow_work(void *data) spl_kmem_cache_t *skc = ska->ska_cache; spl_kmem_slab_t *sks; - sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG); +#if defined(PF_MEMALLOC_NOIO) + unsigned noio_flag = memalloc_noio_save(); + sks = spl_slab_alloc(skc, ska->ska_flags); + memalloc_noio_restore(noio_flag); +#else + sks = spl_slab_alloc(skc, ska->ska_flags); +#endif spin_lock(&skc->skc_lock); if (sks) { skc->skc_slab_total++; @@ -1140,8 +1133,9 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc) static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) { - int remaining, rc; + int remaining, rc = 0; + ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); might_sleep(); @@ -1166,7 +1160,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; - ska = kmalloc(sizeof (*ska), flags); + ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { clear_bit(KMC_BIT_GROWING, &skc->skc_flags); wake_up_all(&skc->skc_waitq); @@ -1175,7 +1169,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) atomic_inc(&skc->skc_ref); ska->ska_cache = skc; - ska->ska_flags = flags & ~__GFP_FS; + ska->ska_flags = flags; taskq_init_ent(&ska->ska_tqe); taskq_dispatch_ent(spl_kmem_cache_taskq, spl_cache_grow_work, ska, 0, &ska->ska_tqe); @@ -1347,9 +1341,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) spl_kmem_magazine_t *skm; void *obj = NULL; + ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - ASSERT(flags & KM_SLEEP); atomic_inc(&skc->skc_ref); @@ -1360,9 +1354,8 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) */ if (skc->skc_flags & KMC_SLAB) { struct kmem_cache *slc = skc->skc_linux_cache; - do { - obj = kmem_cache_alloc(slc, flags | __GFP_COMP); + obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); goto ret; @@ -1445,7 +1438,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) * are guaranteed to have physical addresses. They must be removed * from the tree of emergency objects and the freed. */ - if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) { + if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) { spl_emergency_free(skc, obj); goto out; } diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 96ad2b043..4cd7cdbee 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -23,8 +23,47 @@ */ #include +#include #include #include +#include +#include + +/* + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must by physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to eight pages but capped at 32K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when a allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +unsigned int spl_kmem_alloc_warn = MAX(8 * PAGE_SIZE, 32 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safely + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); int kmem_debugging(void) @@ -72,7 +111,7 @@ __strdup(const char *str, int flags) int n; n = strlen(str); - ptr = kmalloc_nofail(n + 1, flags); + ptr = kmalloc(n + 1, kmem_flags_convert(flags)); if (ptr) memcpy(ptr, str, n + 1); @@ -94,10 +133,101 @@ strfree(char *str) EXPORT_SYMBOL(strfree); /* - * Memory allocation interfaces and debugging for basic kmem_* - * and vmem_* style memory allocation. When DEBUG_KMEM is enabled - * the SPL will keep track of the total memory allocated, and - * report any memory leaked when the module is unloaded. + * Limit the number of large allocation stack traces dumped to not more than + * 5 every 60 seconds to prevent denial-of-service attacks from debug code. + */ +DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5); + +/* + * General purpose unified implementation of kmem_alloc(). It is an + * amalgamation of Linux and Illumos allocator design. It should never be + * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains + * relatively portable. Consumers may only access this function through + * wrappers that enforce the common flags to ensure portability. + */ +inline void * +spl_kmem_alloc_impl(size_t size, int flags, int node) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + /* + * Log abnormally large allocations and rate limit the console output. + * Allocations larger than spl_kmem_alloc_warn should be performed + * through the vmem_alloc()/vmem_zalloc() interfaces. + */ + if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && + !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) { + printk(KERN_WARNING + "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" + "https://github.com/zfsonlinux/zfs/issues/new\n", + (unsigned long)size, flags); + dump_stack(); + } + + /* + * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used + * unlike kmem_alloc() with KM_SLEEP on Illumos. + */ + do { + /* + * Calling kmalloc_node() when the size >= spl_kmem_alloc_max + * is unsafe. This must fail for all for kmem_alloc() and + * kmem_zalloc() callers. + * + * For vmem_alloc() and vmem_zalloc() callers it is permissible + * to use __vmalloc(). However, in general use of __vmalloc() + * is strongly discouraged because a global lock must be + * acquired. Contention on this lock can significantly + * impact performance so frequently manipulating the virtual + * address space is strongly discouraged. + */ + if (unlikely(size > spl_kmem_alloc_max)) { + if (flags & KM_VMEM) { + ptr = __vmalloc(size, lflags, PAGE_KERNEL); + } else { + return (NULL); + } + } else { + ptr = kmalloc_node(size, lflags, node); + } + + if (likely(ptr) || (flags & KM_NOSLEEP)) + return (ptr); + + if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) { + printk(KERN_WARNING + "Possible memory allocation deadlock: " + "size=%lu lflags=0x%x", + (unsigned long)size, lflags); + dump_stack(); + } + + /* + * Use cond_resched() instead of congestion_wait() to avoid + * deadlocking systems where there are no block devices. + */ + cond_resched(); + } while (1); + + return (NULL); +} + +inline void +spl_kmem_free_impl(const void *buf, size_t size) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} + +/* + * Memory allocation and accounting for kmem_* * style allocations. When + * DEBUG_KMEM is enabled the total memory allocated will be tracked and + * any memory leaked will be reported during module unload. + * + * ./configure --enable-debug-kmem */ #ifdef DEBUG_KMEM @@ -113,6 +243,28 @@ unsigned long long kmem_alloc_max = 0; EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + /* * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is @@ -124,9 +276,14 @@ EXPORT_SYMBOL(kmem_alloc_max); * contended particularly on xfree(). If we want to run with this detailed * debugging enabled for anything other than debugging we need to minimize * the contention by moving to a lock per xmem_table entry model. + * + * ./configure --enable-debug-kmem-tracking */ #ifdef DEBUG_KMEM_TRACKING +#include +#include + #define KMEM_HASH_BITS 10 #define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) @@ -139,13 +296,9 @@ typedef struct kmem_debug { int kd_line; /* Allocation line */ } kmem_debug_t; -spinlock_t kmem_lock; -struct hlist_head kmem_table[KMEM_TABLE_SIZE]; -struct list_head kmem_list; - -EXPORT_SYMBOL(kmem_lock); -EXPORT_SYMBOL(kmem_table); -EXPORT_SYMBOL(kmem_list); +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; static kmem_debug_t * kmem_del_init(spinlock_t *lock, struct hlist_head *table, @@ -174,177 +327,113 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table, return (NULL); } -void * -kmem_alloc_track(size_t size, int flags, const char *func, int line, - int node_alloc, int node) +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) { void *ptr = NULL; kmem_debug_t *dptr; unsigned long irq_flags; - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), - flags & ~__GFP_ZERO); + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d " - "failed (%lld/%llu)\n", sizeof (kmem_debug_t), flags, - func, line, kmem_alloc_used_read(), kmem_alloc_max); - } else { - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. - */ - if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } - - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it since the module might have been unloaded. - * This can only fail in the KM_NOSLEEP case. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = kmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } - - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); - - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); - - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; - - spin_lock_irqsave(&kmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &kmem_list); - spin_unlock_irqrestore(&kmem_lock, irq_flags); + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); } -out: + + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } + + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); + + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; + + spin_lock_irqsave(&kmem_lock, irq_flags); + hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); + return (ptr); } -EXPORT_SYMBOL(kmem_alloc_track); -void -kmem_free_track(const void *ptr, size_t size) +inline void +spl_kmem_free_track(const void *ptr, size_t size) { kmem_debug_t *dptr; - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); - /* Must exist in hash due to kmem_alloc() */ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); - ASSERT(dptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - kmem_alloc_used_sub(size); kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof (kmem_debug_t)); kfree(dptr); - memset((void *)ptr, 0x5a, size); - kfree(ptr); + spl_kmem_free_debug(ptr, size); } -EXPORT_SYMBOL(kmem_free_track); - -#else /* DEBUG_KMEM_TRACKING */ - -void * -kmem_alloc_debug(size_t size, int flags, const char *func, int line, - int node_alloc, int node) -{ - void *ptr; - - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. - */ - if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING - "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } - - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = kmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - } else { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); - } - - return (ptr); -} -EXPORT_SYMBOL(kmem_alloc_debug); - -void -kmem_free_debug(const void *ptr, size_t size) -{ - ASSERT(ptr || size > 0); - kmem_alloc_used_sub(size); - kfree(ptr); -} -EXPORT_SYMBOL(kmem_free_debug); - #endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. + */ +void * +spl_kmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_alloc); + +void * +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_ZERO; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_zalloc); + +void +spl_kmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_kmem_free); + #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) static char * spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) @@ -424,22 +513,20 @@ spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) spin_unlock_irqrestore(lock, flags); } -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ int spl_kmem_init(void) { - int rc = 0; - #ifdef DEBUG_KMEM kmem_alloc_used_set(0); - spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); -#endif - return (rc); +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + + return (0); } void @@ -454,8 +541,10 @@ spl_kmem_fini(void) */ if (kmem_alloc_used_read() != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", - kmem_alloc_used_read(), kmem_alloc_max); + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); +#ifdef DEBUG_KMEM_TRACKING spl_kmem_fini_tracking(&kmem_list, &kmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ } diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c index e5712aee0..a434ef54f 100644 --- a/module/spl/spl-proc.c +++ b/module/spl/spl-proc.c @@ -353,26 +353,6 @@ static struct ctl_table spl_kmem_table[] = { .mode = 0444, .proc_handler = &proc_doulongvec_minmax, }, - { - .procname = "vmem_used", - .data = &vmem_alloc_used, -# ifdef HAVE_ATOMIC64_T - .maxlen = sizeof(atomic64_t), -# else - .maxlen = sizeof(atomic_t), -# endif /* HAVE_ATOMIC64_T */ - .mode = 0444, - .proc_handler = &proc_domemused, - }, - { - .procname = "vmem_max", - .data = &vmem_alloc_max, - .maxlen = sizeof(unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, - }, { .procname = "slab_kmem_total", .data = (void *)(KMC_KMEM | KMC_TOTAL), diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c index f4f81048c..9a0987527 100644 --- a/module/spl/spl-tsd.c +++ b/module/spl/spl-tsd.c @@ -337,8 +337,7 @@ tsd_hash_table_init(uint_t bits) if (table == NULL) return (NULL); - table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, - KM_SLEEP | KM_NODEBUG); + table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, KM_SLEEP); if (table->ht_bins == NULL) { kmem_free(table, sizeof(tsd_hash_table_t)); return (NULL); diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c index 51aef941b..e177988a7 100644 --- a/module/spl/spl-vmem.c +++ b/module/spl/spl-vmem.c @@ -24,6 +24,7 @@ #include #include +#include #include vmem_t *heap_arena = NULL; @@ -47,314 +48,62 @@ vmem_size(vmem_t *vmp, int typemask) EXPORT_SYMBOL(vmem_size); /* - * Memory allocation interfaces and debugging for basic kmem_* - * and vmem_* style memory allocation. When DEBUG_KMEM is enabled - * the SPL will keep track of the total memory allocated, and - * report any memory leaked when the module is unloaded. + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. */ -#ifdef DEBUG_KMEM +void * +spl_vmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); -/* Shim layer memory accounting */ -#ifdef HAVE_ATOMIC64_T -atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long vmem_alloc_max = 0; -#else /* HAVE_ATOMIC64_T */ -atomic_t vmem_alloc_used = ATOMIC_INIT(0); -unsigned long long vmem_alloc_max = 0; -#endif /* HAVE_ATOMIC64_T */ + flags |= KM_VMEM; -EXPORT_SYMBOL(vmem_alloc_used); -EXPORT_SYMBOL(vmem_alloc_max); - -/* - * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked - * but also the location of every alloc and free. When the SPL module is - * unloaded a list of all leaked addresses and where they were allocated - * will be dumped to the console. Enabling this feature has a significant - * impact on performance but it makes finding memory leaks straight forward. - * - * Not surprisingly with debugging enabled the xmem_locks are very highly - * contended particularly on xfree(). If we want to run with this detailed - * debugging enabled for anything other than debugging we need to minimize - * the contention by moving to a lock per xmem_table entry model. - */ -#ifdef DEBUG_KMEM_TRACKING - -#define VMEM_HASH_BITS 10 -#define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) - -typedef struct kmem_debug { - struct hlist_node kd_hlist; /* Hash node linkage */ - struct list_head kd_list; /* List of all allocations */ - void *kd_addr; /* Allocation pointer */ - size_t kd_size; /* Allocation size */ - const char *kd_func; /* Allocation function */ - int kd_line; /* Allocation line */ -} kmem_debug_t; - -spinlock_t vmem_lock; -struct hlist_head vmem_table[VMEM_TABLE_SIZE]; -struct list_head vmem_list; - -EXPORT_SYMBOL(vmem_lock); -EXPORT_SYMBOL(vmem_table); -EXPORT_SYMBOL(vmem_list); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_alloc); void * -vmem_alloc_track(size_t size, int flags, const char *func, int line) +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) { - void *ptr = NULL; - kmem_debug_t *dptr; - unsigned long irq_flags; + ASSERT0(flags & ~KM_PUBLIC_MASK); - ASSERT(flags & KM_SLEEP); + flags |= (KM_VMEM | KM_ZERO); - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t), - flags & ~__GFP_ZERO); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - sizeof (kmem_debug_t), flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - } else { - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it, since the module might have been unloaded. - * This can never fail because we have already asserted - * that flags is KM_SLEEP. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = vmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); - - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; - - spin_lock_irqsave(&vmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &vmem_list); - spin_unlock_irqrestore(&vmem_lock, irq_flags); - } -out: - return (ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(vmem_alloc_track); +EXPORT_SYMBOL(spl_vmem_zalloc); void -vmem_free_track(const void *ptr, size_t size) +spl_vmem_free(const void *buf, size_t size) { - kmem_debug_t *dptr; - - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); - - /* Must exist in hash due to vmem_alloc() */ - dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); - ASSERT(dptr); - - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - vmem_alloc_used_sub(size); - kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof (kmem_debug_t)); - kfree(dptr); - - memset((void *)ptr, 0x5a, size); - vfree(ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif } -EXPORT_SYMBOL(vmem_free_track); - -#else /* DEBUG_KMEM_TRACKING */ - -void * -vmem_alloc_debug(size_t size, int flags, const char *func, int line) -{ - void *ptr; - - ASSERT(flags & KM_SLEEP); - - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = vmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max); - } else { - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - } - - return (ptr); -} -EXPORT_SYMBOL(vmem_alloc_debug); - -void -vmem_free_debug(const void *ptr, size_t size) -{ - ASSERT(ptr || size > 0); - vmem_alloc_used_sub(size); - vfree(ptr); -} -EXPORT_SYMBOL(vmem_free_debug); - -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ - -#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) -static char * -spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) -{ - int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; - int i, flag = 1; - - ASSERT(str != NULL && len >= 17); - memset(str, 0, len); - - /* - * Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. - */ - for (i = 0; i < size; i++) { - str[i] = ((char *)(kd->kd_addr))[i]; - if (isprint(str[i])) { - continue; - } else { - /* - * Minimum number of printable characters found - * to make it worthwhile to print this as ascii. - */ - if (i > min) - break; - - flag = 0; - break; - } - } - - if (!flag) { - sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); - } - - return (str); -} - -static int -spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) -{ - int i; - - spin_lock_init(lock); - INIT_LIST_HEAD(list); - - for (i = 0; i < size; i++) - INIT_HLIST_HEAD(&kmem_table[i]); - - return (0); -} - -static void -spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) -{ - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - spin_lock_irqsave(lock, flags); - if (!list_empty(list)) - printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); - - list_for_each_entry(kd, list, kd_list) - printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(lock, flags); -} -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) -#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ +EXPORT_SYMBOL(spl_vmem_free); int spl_vmem_init(void) { - int rc = 0; - -#ifdef DEBUG_KMEM - vmem_alloc_used_set(0); - spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE); -#endif - - return (rc); + return (0); } void spl_vmem_fini(void) { -#ifdef DEBUG_KMEM - /* - * Display all unreclaimed memory addresses, including the - * allocation size and the first few bytes of what's located - * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. - */ - if (vmem_alloc_used_read() != 0) - printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n", - vmem_alloc_used_read(), vmem_alloc_max); - - spl_kmem_fini_tracking(&vmem_list, &vmem_lock); -#endif /* DEBUG_KMEM */ } diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c index 7edc85990..81f748bb6 100644 --- a/module/splat/splat-kmem.c +++ b/module/splat/splat-kmem.c @@ -95,11 +95,11 @@ splat_kmem_test1(struct file *file, void *arg) int size = PAGE_SIZE; int i, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 32))) { + while ((!rc) && (size <= spl_kmem_alloc_warn)) { count = 0; for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { - ptr[i] = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); + ptr[i] = kmem_alloc(size, KM_SLEEP); if (ptr[i]) count++; } @@ -127,11 +127,11 @@ splat_kmem_test2(struct file *file, void *arg) int size = PAGE_SIZE; int i, j, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 32))) { + while ((!rc) && (size <= spl_kmem_alloc_warn)) { count = 0; for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { - ptr[i] = kmem_zalloc(size, KM_SLEEP | KM_NODEBUG); + ptr[i] = kmem_zalloc(size, KM_SLEEP); if (ptr[i]) count++; } @@ -171,7 +171,11 @@ splat_kmem_test3(struct file *file, void *arg) int size = PAGE_SIZE; int i, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + /* + * Test up to 4x the maximum kmem_alloc() size to ensure both + * the kmem_alloc() and vmem_alloc() call paths are used. + */ + while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) { count = 0; for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { @@ -203,7 +207,11 @@ splat_kmem_test4(struct file *file, void *arg) int size = PAGE_SIZE; int i, j, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + /* + * Test up to 4x the maximum kmem_zalloc() size to ensure both + * the kmem_zalloc() and vmem_zalloc() call paths are used. + */ + while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) { count = 0; for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) {