Ensure kmem_alloc() and vmem_alloc() never fail

The Solaris semantics for kmem_alloc() and vmem_alloc() are that they
must never fail when called with KM_SLEEP.  They may only fail if
called with KM_NOSLEEP; otherwise they must block until memory is
available.  This is quite different from how the Linux memory
allocators work: under Linux a memory allocation failure is always
possible and must be dealt with.

At one point in the past the kmem code did properly implement this
behavior; however, as the code evolved this behavior was overlooked
in places.  This patch goes through all three implementations of
the kmem/vmem allocation functions and ensures that they will all
block in the KM_SLEEP case when memory is not available.  They
may still fail in the KM_NOSLEEP case, in which case the caller
is responsible for handling the failure.

Special care is taken in vmalloc_nofail() to avoid thrashing the
system on the virtual address space spin lock.  The downside, of
course, is that if you do see a failure here, which is unlikely on
64-bit systems, your allocation will be delayed for an entire second.
Still, this is preferable to locking up your system and it is the
best we can do given the constraints.

Additionally, the code was cleaned up to be much more readable
and comments were added to describe the various kmem-debug-*
configure options.  The default configure options remain:
"--enable-debug-kmem --disable-debug-kmem-tracking"
This commit is contained in:
Brian Behlendorf 2010-07-26 15:47:55 -07:00
parent 849c50e7f2
commit 10129680f8
2 changed files with 285 additions and 189 deletions

View File

@ -87,10 +87,10 @@ kzalloc_nofail(size_t size, gfp_t flags)
return ptr; return ptr;
} }
#ifdef HAVE_KMALLOC_NODE
static inline void * static inline void *
kmalloc_node_nofail(size_t size, gfp_t flags, int node) kmalloc_node_nofail(size_t size, gfp_t flags, int node)
{ {
#ifdef HAVE_KMALLOC_NODE
void *ptr; void *ptr;
do { do {
@ -98,16 +98,63 @@ kmalloc_node_nofail(size_t size, gfp_t flags, int node)
} while (ptr == NULL && (flags & __GFP_WAIT)); } while (ptr == NULL && (flags & __GFP_WAIT));
return ptr; return ptr;
} #else
return kmalloc_nofail(size, flags);
#endif /* HAVE_KMALLOC_NODE */ #endif /* HAVE_KMALLOC_NODE */
}
/*
 * Allocate 'size' bytes of virtually contiguous memory with __vmalloc().
 *
 * When 'flags' includes __GFP_WAIT the call never returns NULL; it keeps
 * retrying until the allocation succeeds.  Without __GFP_WAIT a failed
 * allocation is returned to the caller as NULL.
 */
static inline void *
vmalloc_nofail(size_t size, gfp_t flags)
{
	void *ptr;

	/*
	 * Retry failed __vmalloc() allocations once every second.  The
	 * rationale for the delay is that the likely failure modes are:
	 *
	 * 1) The system has completely exhausted memory, in which case
	 *    delaying 1 second for the memory reclaim to run is reasonable
	 *    to avoid thrashing the system.
	 * 2) The system has memory but has exhausted the small virtual
	 *    address space available on 32-bit systems.  Retrying the
	 *    allocation immediately will only result in spinning on the
	 *    virtual address space lock.  It is better to delay a second
	 *    and hope that another process will free some of the address
	 *    space.  But the bottom line is there is not much we can
	 *    actually do since we can never safely return a failure and
	 *    honor the Solaris semantics.
	 */
	while (1) {
		ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
		if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
			/*
			 * Sleep uninterruptibly.  With TASK_INTERRUPTIBLE a
			 * pending signal would make schedule_timeout() return
			 * immediately, turning this loop into exactly the
			 * busy retry the delay is meant to prevent.
			 */
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(HZ);
		} else {
			break;
		}
	}

	return ptr;
}
/*
 * Zero-filled variant of vmalloc_nofail(): obtains 'size' bytes from
 * vmalloc_nofail() using 'flags' and clears them before returning.
 * A NULL result from vmalloc_nofail() is passed through unchanged.
 */
static inline void *
vzalloc_nofail(size_t size, gfp_t flags)
{
	void *buf = vmalloc_nofail(size, flags);

	/* Nothing to clear when the underlying allocation failed. */
	if (buf == NULL)
		return NULL;

	memset(buf, 0, size);
	return buf;
}
#ifdef DEBUG_KMEM #ifdef DEBUG_KMEM
# ifdef HAVE_ATOMIC64_T
extern atomic64_t kmem_alloc_used; /*
extern unsigned long long kmem_alloc_max; * Memory accounting functions to be used only when DEBUG_KMEM is set.
extern atomic64_t vmem_alloc_used; */
extern unsigned long long vmem_alloc_max; # ifdef HAVE_ATOMIC64_T
# define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) # define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) # define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
@ -118,13 +165,13 @@ extern unsigned long long vmem_alloc_max;
# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) # define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) # define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)
# else extern atomic64_t kmem_alloc_used;
extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max; extern unsigned long long kmem_alloc_max;
extern atomic_t vmem_alloc_used; extern atomic64_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max; extern unsigned long long vmem_alloc_max;
# else /* HAVE_ATOMIC64_T */
# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) # define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) # define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) # define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
@ -134,90 +181,107 @@ extern unsigned long long vmem_alloc_max;
# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) # define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) # define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)
# endif /* _LP64 */ extern atomic_t kmem_alloc_used;
extern unsigned long long kmem_alloc_max;
extern atomic_t vmem_alloc_used;
extern unsigned long long vmem_alloc_max;
# define kmem_alloc(size, flags) __kmem_alloc((size), (flags), 0, 0) # endif /* HAVE_ATOMIC64_T */
# define kmem_zalloc(size, flags) __kmem_alloc((size), ((flags) | \
__GFP_ZERO), 0, 0)
/* The node alloc functions are only used by the SPL code itself */
# ifdef HAVE_KMALLOC_NODE
# define kmem_alloc_node(size, flags, node) __kmem_alloc((size), (flags), 1, \
node)
# else
# define kmem_alloc_node(size, flags, node) __kmem_alloc((size), (flags), 0, 0)
# endif
# define vmem_zalloc(size, flags) vmem_alloc((size), ((flags) | \
__GFP_ZERO))
# ifdef DEBUG_KMEM_TRACKING # ifdef DEBUG_KMEM_TRACKING
/*
* DEBUG_KMEM && DEBUG_KMEM_TRACKING
*
* The maximum level of memory debugging. All memory will be accounted
* for and each allocation will be explicitly tracked. Any allocation
* which is leaked will be reported on module unload and the exact location
* where that memory was allocated will be reported. This level of memory
* tracking will have a significant impact on performance and should only
* be enabled for debugging. This feature may be enabled by passing
* --enable-debug-kmem-tracking to configure.
*/
# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__, 0, 0)
# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__, 0, 0)
# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
__FUNCTION__, __LINE__, 1, nd)
# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))
extern void *kmem_alloc_track(size_t size, int flags, const char *func, # define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
int line, int node_alloc, int node); __FUNCTION__, __LINE__)
extern void kmem_free_track(void *ptr, size_t size); # define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
extern void *vmem_alloc_track(size_t size, int flags, const char *func, __FUNCTION__, __LINE__)
int line); # define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))
extern void vmem_free_track(void *ptr, size_t size);
# define __kmem_alloc(size, flags, na, node) kmem_alloc_track((size), \ extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
(flags), __FUNCTION__, \ extern void kmem_free_track(void *, size_t);
__LINE__, (na), (node)) extern void *vmem_alloc_track(size_t, int, const char *, int);
# define kmem_free(ptr, size) kmem_free_track((ptr), (size)) extern void vmem_free_track(void *, size_t);
# define vmem_alloc(size, flags) vmem_alloc_track((size), \
(flags),__FUNCTION__, \
__LINE__)
# define vmem_free(ptr, size) vmem_free_track((ptr), (size))
# else /* DEBUG_KMEM_TRACKING */ # else /* DEBUG_KMEM_TRACKING */
/*
* DEBUG_KMEM && !DEBUG_KMEM_TRACKING
*
* The default build will set DEBUG_KMEM. This provides basic memory
* accounting with little to no impact on performance. When the module
* is unloaded, if any memory was leaked, the total number of leaked bytes
* will be reported on the console. To disable this basic accounting
* pass the --disable-debug-kmem option to configure.
*/
# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__, 0, 0)
# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
__FUNCTION__, __LINE__, 0, 0)
# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
__FUNCTION__, __LINE__, 1, nd)
# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))
extern void *kmem_alloc_debug(size_t size, int flags, const char *func, # define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
int line, int node_alloc, int node); __FUNCTION__, __LINE__)
extern void kmem_free_debug(void *ptr, size_t size); # define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
extern void *vmem_alloc_debug(size_t size, int flags, const char *func, __FUNCTION__, __LINE__)
int line); # define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))
extern void vmem_free_debug(void *ptr, size_t size);
# define __kmem_alloc(size, flags, na, node) kmem_alloc_debug((size), \ extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
(flags), __FUNCTION__, \ extern void kmem_free_debug(void *, size_t);
__LINE__, (na), (node)) extern void *vmem_alloc_debug(size_t, int, const char *, int);
# define kmem_free(ptr, size) kmem_free_debug((ptr), (size)) extern void vmem_free_debug(void *, size_t);
# define vmem_alloc(size, flags) vmem_alloc_debug((size), \
(flags), __FUNCTION__, \
__LINE__)
# define vmem_free(ptr, size) vmem_free_debug((ptr), (size))
# endif /* DEBUG_KMEM_TRACKING */ # endif /* DEBUG_KMEM_TRACKING */
#else /* DEBUG_KMEM */ #else /* DEBUG_KMEM */
/*
* !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
*
* All debugging is disabled. There will be no overhead even for
* minimal memory accounting. To enable basic accounting pass the
* --enable-debug-kmem option to configure.
*/
# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))
# define kmem_alloc(size, flags) kmalloc_nofail((size), (flags)) # define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
# define kmem_zalloc(size, flags) kzalloc_nofail((size), (flags)) # define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
# define kmem_free(ptr, size) ((void)(size), kfree(ptr)) # define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
# ifdef HAVE_KMALLOC_NODE
# define kmem_alloc_node(size, flags, node) \
kmalloc_node_nofail((size), (flags), (node))
# else
# define kmem_alloc_node(size, flags, node) \
kmalloc_nofail((size), (flags))
# endif
# define vmem_alloc(size, flags) __vmalloc((size), ((flags) | \
__GFP_HIGHMEM), PAGE_KERNEL)
# define vmem_zalloc(size, flags) \
({ \
void *_ptr_ = __vmalloc((size),((flags)|__GFP_HIGHMEM),PAGE_KERNEL); \
if (_ptr_) \
memset(_ptr_, 0, (size)); \
_ptr_; \
})
# define vmem_free(ptr, size) ((void)(size), vfree(ptr))
#endif /* DEBUG_KMEM */ #endif /* DEBUG_KMEM */
extern int kmem_debugging(void);
extern char *kmem_vasprintf(const char *fmt, va_list ap);
extern char *kmem_asprintf(const char *fmt, ...);
extern char *strdup(const char *str);
extern void strfree(char *str);
/* /*
* Slab allocation interfaces * Slab allocation interfaces. The SPL slab differs from the standard
* Linux SLAB or SLUB primarily in that each cache may be backed by slabs
* allocated from the physical or virtual memory address space. The virtual
* slabs allow for good behavior when allocating large objects of identical
* size. This slab implementation also supports both constructors and
* destructors which the Linux slab does not.
*/ */
enum { enum {
KMC_BIT_NOTOUCH = 0, /* Don't update ages */ KMC_BIT_NOTOUCH = 0, /* Don't update ages */
@ -246,12 +310,6 @@ enum {
#define KMC_REAP_CHUNK INT_MAX #define KMC_REAP_CHUNK INT_MAX
#define KMC_DEFAULT_SEEKS 1 #define KMC_DEFAULT_SEEKS 1
extern int kmem_debugging(void);
extern char *kmem_vasprintf(const char *fmt, va_list ap);
extern char *kmem_asprintf(const char *fmt, ...);
#define strfree(str) kfree(str)
#define strdup(str) kstrdup(str, GFP_KERNEL)
extern struct list_head spl_kmem_cache_list; extern struct list_head spl_kmem_cache_list;
extern struct rw_semaphore spl_kmem_cache_sem; extern struct rw_semaphore spl_kmem_cache_sem;

View File

@ -271,6 +271,34 @@ kmem_asprintf(const char *fmt, ...)
} }
EXPORT_SYMBOL(kmem_asprintf); EXPORT_SYMBOL(kmem_asprintf);
/*
 * Duplicate the NUL-terminated string 'str' into memory obtained from
 * kmalloc_nofail() using the allocation 'flags' supplied by the caller.
 *
 * Returns the new copy, or NULL if kmalloc_nofail() returned NULL.
 */
static char *
__strdup(const char *str, int flags)
{
	char *ptr;
	size_t n;	/* strlen() yields a size_t; avoid narrowing to int */

	n = strlen(str);
	ptr = kmalloc_nofail(n + 1, flags);
	if (ptr)
		memcpy(ptr, str, n + 1);	/* n + 1 copies the terminator */

	return ptr;
}
/*
 * Exported string duplication helper.  Delegates to __strdup() with
 * KM_SLEEP as the allocation flags and hands back whatever it returns.
 */
char *
strdup(const char *str)
{
	char *copy = __strdup(str, KM_SLEEP);

	return copy;
}
EXPORT_SYMBOL(strdup);
/*
 * Release a string previously returned by strdup().
 *
 * The string is allocated by __strdup() with a direct kmalloc_nofail()
 * call, so it never passes through the kmem_alloc() debug paths.  It
 * must therefore be released with kfree() rather than kmem_free(): in
 * tracking builds kmem_free() resolves to kmem_free_track(), which
 * asserts that the pointer was recorded by kmem_alloc().  Using kfree()
 * also avoids calling strlen() on a NULL pointer.
 */
void
strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(strfree);
/* /*
* Memory allocation interfaces and debugging for basic kmem_* * Memory allocation interfaces and debugging for basic kmem_*
* and vmem_* style memory allocation. When DEBUG_KMEM is enabled * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
@ -285,12 +313,12 @@ atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0; unsigned long long kmem_alloc_max = 0;
atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long vmem_alloc_max = 0; unsigned long long vmem_alloc_max = 0;
# else # else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0); atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0; unsigned long long kmem_alloc_max = 0;
atomic_t vmem_alloc_used = ATOMIC_INIT(0); atomic_t vmem_alloc_used = ATOMIC_INIT(0);
unsigned long long vmem_alloc_max = 0; unsigned long long vmem_alloc_max = 0;
# endif /* _LP64 */ # endif /* HAVE_ATOMIC64_T */
EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max); EXPORT_SYMBOL(kmem_alloc_max);
@ -340,77 +368,9 @@ EXPORT_SYMBOL(kmem_list);
EXPORT_SYMBOL(vmem_lock); EXPORT_SYMBOL(vmem_lock);
EXPORT_SYMBOL(vmem_table); EXPORT_SYMBOL(vmem_table);
EXPORT_SYMBOL(vmem_list); EXPORT_SYMBOL(vmem_list);
# endif
#endif
/*
* Slab allocation interfaces
*
* While the Linux slab implementation was inspired by the Solaris
* implemenation I cannot use it to emulate the Solaris APIs. I
* require two features which are not provided by the Linux slab.
*
* 1) Constructors AND destructors. Recent versions of the Linux
* kernel have removed support for destructors. This is a deal
* breaker for the SPL which contains particularly expensive
* initializers for mutex's, condition variables, etc. We also
* require a minimal level of cleanup for these data types unlike
* many Linux data type which do need to be explicitly destroyed.
*
* 2) Virtual address space backed slab. Callers of the Solaris slab
* expect it to work well for both small are very large allocations.
* Because of memory fragmentation the Linux slab which is backed
* by kmalloc'ed memory performs very badly when confronted with
* large numbers of large allocations. Basing the slab on the
* virtual address space removes the need for contigeous pages
* and greatly improve performance for large allocations.
*
* For these reasons, the SPL has its own slab implementation with
* the needed features. It is not as highly optimized as either the
* Solaris or Linux slabs, but it should get me most of what is
* needed until it can be optimized or obsoleted by another approach.
*
* One serious concern I do have about this method is the relatively
* small virtual address space on 32bit arches. This will seriously
* constrain the size of the slab caches and their performance.
*
* XXX: Improve the partial slab list by carefully maintaining a
* strict ordering of fullest to emptiest slabs based on
* the slab reference count. This gaurentees the when freeing
* slabs back to the system we need only linearly traverse the
* last N slabs in the list to discover all the freeable slabs.
*
* XXX: NUMA awareness for optionally allocating memory close to a
* particular core. This can be adventageous if you know the slab
* object will be short lived and primarily accessed from one core.
*
* XXX: Slab coloring may also yield performance improvements and would
* be desirable to implement.
*/
struct list_head spl_kmem_cache_list; /* List of caches */
struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
static int spl_cache_flush(spl_kmem_cache_t *skc,
spl_kmem_magazine_t *skm, int flush);
#ifdef HAVE_SET_SHRINKER
static struct shrinker *spl_kmem_cache_shrinker;
#else
static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
unsigned int gfp_mask);
static struct shrinker spl_kmem_cache_shrinker = {
.shrink = spl_kmem_cache_generic_shrinker,
.seeks = KMC_DEFAULT_SEEKS,
};
#endif
#ifdef DEBUG_KMEM
# ifdef DEBUG_KMEM_TRACKING
static kmem_debug_t * static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, void *addr)
void *addr)
{ {
struct hlist_head *head; struct hlist_head *head;
struct hlist_node *node; struct hlist_node *node;
@ -444,17 +404,20 @@ kmem_alloc_track(size_t size, int flags, const char *func, int line,
unsigned long irq_flags; unsigned long irq_flags;
SENTRY; SENTRY;
/* Function may be called with KM_NOSLEEP so failure is possible */
dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
flags & ~__GFP_ZERO); flags & ~__GFP_ZERO);
if (dptr == NULL) { if (unlikely(dptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug " SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
"kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n", "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
sizeof(kmem_debug_t), flags, func, line, sizeof(kmem_debug_t), flags, func, line,
kmem_alloc_used_read(), kmem_alloc_max); kmem_alloc_used_read(), kmem_alloc_max);
} else { } else {
/* Marked unlikely because we should never be doing this, /*
* we tolerate to up 2 pages but a single page is best. */ * Marked unlikely because we should never be doing this,
* we tolerate to up 2 pages but a single page is best.
*/
if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) { if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large " SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large "
"kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
@ -463,14 +426,17 @@ kmem_alloc_track(size_t size, int flags, const char *func, int line,
spl_debug_dumpstack(NULL); spl_debug_dumpstack(NULL);
} }
/* We use kstrdup() below because the string pointed to by /*
* We use __strdup() below because the string pointed to by
* __FUNCTION__ might not be available by the time we want * __FUNCTION__ might not be available by the time we want
* to print it since the module might have been unloaded. */ * to print it since the module might have been unloaded.
dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO); * This can only fail in the KM_NOSLEEP case.
*/
dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
if (unlikely(dptr->kd_func == NULL)) { if (unlikely(dptr->kd_func == NULL)) {
kfree(dptr); kfree(dptr);
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
"debug kstrdup() at %s:%d failed (%lld/%llu)\n", "debug __strdup() at %s:%d failed (%lld/%llu)\n",
func, line, kmem_alloc_used_read(), kmem_alloc_max); func, line, kmem_alloc_used_read(), kmem_alloc_max);
goto out; goto out;
} }
@ -533,7 +499,8 @@ kmem_free_track(void *ptr, size_t size)
dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
ASSERT(dptr); /* Must exist in hash due to kmem_alloc() */ /* Must exist in hash due to kmem_alloc() */
ASSERT(dptr);
/* Size must match */ /* Size must match */
ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
@ -567,28 +534,37 @@ vmem_alloc_track(size_t size, int flags, const char *func, int line)
ASSERT(flags & KM_SLEEP); ASSERT(flags & KM_SLEEP);
/* Function may be called with KM_NOSLEEP so failure is possible */
dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t),
flags & ~__GFP_ZERO); flags & ~__GFP_ZERO);
if (dptr == NULL) { if (unlikely(dptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug " SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug "
"vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n", "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n",
sizeof(kmem_debug_t), flags, func, line, sizeof(kmem_debug_t), flags, func, line,
vmem_alloc_used_read(), vmem_alloc_max); vmem_alloc_used_read(), vmem_alloc_max);
} else { } else {
/* We use kstrdup() below because the string pointed to by /*
* We use __strdup() below because the string pointed to by
* __FUNCTION__ might not be available by the time we want * __FUNCTION__ might not be available by the time we want
* to print it, since the module might have been unloaded. */ * to print it, since the module might have been unloaded.
dptr->kd_func = kstrdup(func, flags & ~__GFP_ZERO); * This can never fail because we have already asserted
* that flags is KM_SLEEP.
*/
dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
if (unlikely(dptr->kd_func == NULL)) { if (unlikely(dptr->kd_func == NULL)) {
kfree(dptr); kfree(dptr);
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
"debug kstrdup() at %s:%d failed (%lld/%llu)\n", "debug __strdup() at %s:%d failed (%lld/%llu)\n",
func, line, vmem_alloc_used_read(), vmem_alloc_max); func, line, vmem_alloc_used_read(), vmem_alloc_max);
goto out; goto out;
} }
ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO, /* Use the correct allocator */
PAGE_KERNEL); if (flags & __GFP_ZERO) {
ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
} else {
ptr = vmalloc_nofail(size, flags);
}
if (unlikely(ptr == NULL)) { if (unlikely(ptr == NULL)) {
kfree(dptr->kd_func); kfree(dptr->kd_func);
@ -600,9 +576,6 @@ vmem_alloc_track(size_t size, int flags, const char *func, int line)
goto out; goto out;
} }
if (flags & __GFP_ZERO)
memset(ptr, 0, size);
vmem_alloc_used_add(size); vmem_alloc_used_add(size);
if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
vmem_alloc_max = vmem_alloc_used_read(); vmem_alloc_max = vmem_alloc_used_read();
@ -640,7 +613,9 @@ vmem_free_track(void *ptr, size_t size)
(unsigned long long) size); (unsigned long long) size);
dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
ASSERT(dptr); /* Must exist in hash due to vmem_alloc() */
/* Must exist in hash due to vmem_alloc() */
ASSERT(dptr);
/* Size must match */ /* Size must match */
ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
@ -673,11 +648,13 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
void *ptr; void *ptr;
SENTRY; SENTRY;
/* Marked unlikely because we should never be doing this, /*
* we tolerate to up 2 pages but a single page is best. */ * Marked unlikely because we should never be doing this,
* we tolerate to up 2 pages but a single page is best.
*/
if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) { if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
SDEBUG(SD_CONSOLE | SD_WARNING, SDEBUG(SD_CONSOLE | SD_WARNING,
"Large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
(unsigned long long) size, flags, func, line, (unsigned long long) size, flags, func, line,
kmem_alloc_used_read(), kmem_alloc_max); kmem_alloc_used_read(), kmem_alloc_max);
spl_debug_dumpstack(NULL); spl_debug_dumpstack(NULL);
@ -693,7 +670,7 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
ptr = kmalloc_nofail(size, flags); ptr = kmalloc_nofail(size, flags);
} }
if (ptr == NULL) { if (unlikely(ptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
"kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
(unsigned long long) size, flags, func, line, (unsigned long long) size, flags, func, line,
@ -706,8 +683,9 @@ kmem_alloc_debug(size_t size, int flags, const char *func, int line,
SDEBUG_LIMIT(SD_INFO, SDEBUG_LIMIT(SD_INFO,
"kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n", "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n",
(unsigned long long) size, flags, func, line, ptr, (unsigned long long) size, flags, func, line, ptr,
kmem_alloc_used_read(), kmem_alloc_max); kmem_alloc_used_read(), kmem_alloc_max);
} }
SRETURN(ptr); SRETURN(ptr);
} }
EXPORT_SYMBOL(kmem_alloc_debug); EXPORT_SYMBOL(kmem_alloc_debug);
@ -724,8 +702,6 @@ kmem_free_debug(void *ptr, size_t size)
SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr, SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr,
(unsigned long long) size, kmem_alloc_used_read(), (unsigned long long) size, kmem_alloc_used_read(),
kmem_alloc_max); kmem_alloc_max);
memset(ptr, 0x5a, size);
kfree(ptr); kfree(ptr);
SEXIT; SEXIT;
@ -740,17 +716,19 @@ vmem_alloc_debug(size_t size, int flags, const char *func, int line)
ASSERT(flags & KM_SLEEP); ASSERT(flags & KM_SLEEP);
ptr = __vmalloc(size, (flags | __GFP_HIGHMEM) & ~__GFP_ZERO, /* Use the correct allocator */
PAGE_KERNEL); if (flags & __GFP_ZERO) {
if (ptr == NULL) { ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
} else {
ptr = vmalloc_nofail(size, flags);
}
if (unlikely(ptr == NULL)) {
SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING,
"vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
(unsigned long long) size, flags, func, line, (unsigned long long) size, flags, func, line,
vmem_alloc_used_read(), vmem_alloc_max); vmem_alloc_used_read(), vmem_alloc_max);
} else { } else {
if (flags & __GFP_ZERO)
memset(ptr, 0, size);
vmem_alloc_used_add(size); vmem_alloc_used_add(size);
if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
vmem_alloc_max = vmem_alloc_used_read(); vmem_alloc_max = vmem_alloc_used_read();
@ -776,8 +754,6 @@ vmem_free_debug(void *ptr, size_t size)
SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr, SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr,
(unsigned long long) size, vmem_alloc_used_read(), (unsigned long long) size, vmem_alloc_used_read(),
vmem_alloc_max); vmem_alloc_max);
memset(ptr, 0x5a, size);
vfree(ptr); vfree(ptr);
SEXIT; SEXIT;
@ -787,6 +763,68 @@ EXPORT_SYMBOL(vmem_free_debug);
# endif /* DEBUG_KMEM_TRACKING */ # endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */ #endif /* DEBUG_KMEM */
/*
* Slab allocation interfaces
*
* While the Linux slab implementation was inspired by the Solaris
* implementation I cannot use it to emulate the Solaris APIs. I
* require two features which are not provided by the Linux slab.
*
* 1) Constructors AND destructors. Recent versions of the Linux
* kernel have removed support for destructors. This is a deal
* breaker for the SPL which contains particularly expensive
* initializers for mutexes, condition variables, etc. We also
* require a minimal level of cleanup for these data types unlike
* many Linux data types which do not need to be explicitly destroyed.
*
* 2) Virtual address space backed slab. Callers of the Solaris slab
* expect it to work well for both small and very large allocations.
* Because of memory fragmentation the Linux slab which is backed
* by kmalloc'ed memory performs very badly when confronted with
* large numbers of large allocations. Basing the slab on the
* virtual address space removes the need for contiguous pages
* and greatly improves performance for large allocations.
*
* For these reasons, the SPL has its own slab implementation with
* the needed features. It is not as highly optimized as either the
* Solaris or Linux slabs, but it should get me most of what is
* needed until it can be optimized or obsoleted by another approach.
*
* One serious concern I do have about this method is the relatively
* small virtual address space on 32bit arches. This will seriously
* constrain the size of the slab caches and their performance.
*
* XXX: Improve the partial slab list by carefully maintaining a
* strict ordering of fullest to emptiest slabs based on
* the slab reference count. This guarantees that when freeing
* slabs back to the system we need only linearly traverse the
* last N slabs in the list to discover all the freeable slabs.
*
* XXX: NUMA awareness for optionally allocating memory close to a
* particular core. This can be advantageous if you know the slab
* object will be short lived and primarily accessed from one core.
*
* XXX: Slab coloring may also yield performance improvements and would
* be desirable to implement.
*/
struct list_head spl_kmem_cache_list; /* List of caches */
struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
static int spl_cache_flush(spl_kmem_cache_t *skc,
spl_kmem_magazine_t *skm, int flush);
#ifdef HAVE_SET_SHRINKER
static struct shrinker *spl_kmem_cache_shrinker;
#else
static int spl_kmem_cache_generic_shrinker(int nr_to_scan,
unsigned int gfp_mask);
static struct shrinker spl_kmem_cache_shrinker = {
.shrink = spl_kmem_cache_generic_shrinker,
.seeks = KMC_DEFAULT_SEEKS,
};
#endif
static void * static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags) kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{ {