mirror_zfs/module/os/linux/spl/spl-kmem.c
Matthew Ahrens bbcec73783
kmem_alloc(KM_SLEEP) should use kvmalloc()
`kmem_alloc(size>PAGESIZE, KM_SLEEP)` is backed by `kmalloc()`, which
finds contiguous physical memory.  If there isn't enough contiguous
physical memory available (e.g. due to physical page fragmentation), the
OOM killer will be invoked to make more memory available.  This is not
ideal because processes may be killed when there is still plenty of free
memory (it just happens to be in individual pages, not contiguous runs
of pages).  We have observed this when allocating the ~13KB `zfs_cmd_t`,
for example in `zfsdev_ioctl()`.

This commit changes the behavior of
`kmem_alloc(size>PAGESIZE, KM_SLEEP)` when there are insufficient
contiguous free pages.  In this case we will find individual pages and
stitch them together using virtual memory.  This is accomplished by
using `kvmalloc()`, which implements the described behavior by trying
`kmalloc(__GFP_NORETRY)` and falling back on `vmalloc()`.

The behavior of `kmem_alloc(KM_NOSLEEP)` is not changed; it continues to
use `kmalloc(GPF_ATOMIC | __GFP_NORETRY)`.  This is because `vmalloc()`
may sleep.

Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <gwilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11461
2021-04-06 12:44:54 -07:00

632 lines
17 KiB
C

/*
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Brian Behlendorf <behlendorf1@llnl.gov>.
* UCRL-CODE-235197
*
* This file is part of the SPL, Solaris Porting Layer.
*
* The SPL is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* The SPL is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
/*
* As a general rule kmem_alloc() allocations should be small, preferably
* just a few pages since they must by physically contiguous. Therefore, a
* rate limited warning will be printed to the console for any kmem_alloc()
* which exceeds a reasonable threshold.
*
* The default warning threshold is set to sixteen pages but capped at 64K to
* accommodate systems using large pages. This value was selected to be small
* enough to ensure the largest allocations are quickly noticed and fixed.
* But large enough to avoid logging any warnings when a allocation size is
* larger than optimal but not a serious concern. Since this value is tunable,
* developers are encouraged to set it lower when testing so any new largish
* allocations are quickly caught. These warnings may be disabled by setting
* the threshold to zero.
*/
/* BEGIN CSTYLED */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);
/*
* Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
* Allocations which are marginally smaller than this limit may succeed but
* should still be avoided due to the expense of locating a contiguous range
* of free pages. Therefore, a maximum kmem size with reasonable safely
* margin of 4x is set. Kmem_alloc() allocations larger than this maximum
* will quickly fail. Vmem_alloc() allocations less than or equal to this
* value will use kmalloc(), but shift to vmalloc() when exceeding this value.
*/
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);
/* END CSTYLED */
int
kmem_debugging(void)
{
return (0);
}
EXPORT_SYMBOL(kmem_debugging);
char *
kmem_vasprintf(const char *fmt, va_list ap)
{
va_list aq;
char *ptr;
do {
va_copy(aq, ap);
ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
va_end(aq);
} while (ptr == NULL);
return (ptr);
}
EXPORT_SYMBOL(kmem_vasprintf);
char *
kmem_asprintf(const char *fmt, ...)
{
va_list ap;
char *ptr;
do {
va_start(ap, fmt);
ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
va_end(ap);
} while (ptr == NULL);
return (ptr);
}
EXPORT_SYMBOL(kmem_asprintf);
static char *
__strdup(const char *str, int flags)
{
char *ptr;
int n;
n = strlen(str);
ptr = kmalloc(n + 1, kmem_flags_convert(flags));
if (ptr)
memcpy(ptr, str, n + 1);
return (ptr);
}
char *
kmem_strdup(const char *str)
{
return (__strdup(str, KM_SLEEP));
}
EXPORT_SYMBOL(kmem_strdup);
void
kmem_strfree(char *str)
{
kfree(str);
}
EXPORT_SYMBOL(kmem_strfree);
void *
spl_kvmalloc(size_t size, gfp_t lflags)
{
#ifdef HAVE_KVMALLOC
/*
* GFP_KERNEL allocations can safely use kvmalloc which may
* improve performance by avoiding a) high latency caused by
* vmalloc's on-access allocation, b) performance loss due to
* MMU memory address mapping and c) vmalloc locking overhead.
* This has the side-effect that the slab statistics will
* incorrectly report this as a vmem allocation, but that is
* purely cosmetic.
*/
if ((lflags & GFP_KERNEL) == GFP_KERNEL)
return (kvmalloc(size, lflags));
#endif
gfp_t kmalloc_lflags = lflags;
if (size > PAGE_SIZE) {
/*
* We need to set __GFP_NOWARN here since spl_kvmalloc is not
* only called by spl_kmem_alloc_impl but can be called
* directly with custom lflags, too. In that case
* kmem_flags_convert does not get called, which would
* implicitly set __GFP_NOWARN.
*/
kmalloc_lflags |= __GFP_NOWARN;
/*
* N.B. __GFP_RETRY_MAYFAIL is supported only for large
* e (>32kB) allocations.
*
* We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
* for !costly requests because there is no other way to tell
* the allocator that we want to fail rather than retry
* endlessly.
*/
if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
(size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
kmalloc_lflags |= __GFP_NORETRY;
}
}
/*
* We first try kmalloc - even for big sizes - and fall back to
* spl_vmalloc if that fails.
*
* For non-__GFP-RECLAIM allocations we always stick to
* kmalloc_node, and fail when kmalloc is not successful (returns
* NULL).
* We cannot fall back to spl_vmalloc in this case because spl_vmalloc
* internally uses GPF_KERNEL allocations.
*/
void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
if (ptr || size <= PAGE_SIZE ||
(lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
return (ptr);
}
return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
}
/*
* General purpose unified implementation of kmem_alloc(). It is an
* amalgamation of Linux and Illumos allocator design. It should never be
* exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
* relatively portable. Consumers may only access this function through
* wrappers that enforce the common flags to ensure portability.
*/
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
/*
* Log abnormally large allocations and rate limit the console output.
* Allocations larger than spl_kmem_alloc_warn should be performed
* through the vmem_alloc()/vmem_zalloc() interfaces.
*/
if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
!(flags & KM_VMEM)) {
printk(KERN_WARNING
"Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
"https://github.com/openzfs/zfs/issues/new\n",
(unsigned long)size, flags);
dump_stack();
}
/*
* Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
* unlike kmem_alloc() with KM_SLEEP on Illumos.
*/
do {
/*
* Calling kmalloc_node() when the size >= spl_kmem_alloc_max
* is unsafe. This must fail for all for kmem_alloc() and
* kmem_zalloc() callers.
*
* For vmem_alloc() and vmem_zalloc() callers it is permissible
* to use spl_vmalloc(). However, in general use of
* spl_vmalloc() is strongly discouraged because a global lock
* must be acquired. Contention on this lock can significantly
* impact performance so frequently manipulating the virtual
* address space is strongly discouraged.
*/
if (size > spl_kmem_alloc_max) {
if (flags & KM_VMEM) {
ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
} else {
return (NULL);
}
} else {
/*
* We use kmalloc when doing kmem_alloc(KM_NOSLEEP),
* because kvmalloc/vmalloc may sleep. We also use
* kmalloc on systems with limited kernel VA space (e.g.
* 32-bit), which have HIGHMEM. Otherwise we use
* kvmalloc, which tries to get contiguous physical
* memory (fast, like kmalloc) and falls back on using
* virtual memory to stitch together pages (slow, like
* vmalloc).
*/
#ifdef CONFIG_HIGHMEM
if (flags & KM_VMEM) {
#else
if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) {
#endif
ptr = spl_kvmalloc(size, lflags);
} else {
ptr = kmalloc_node(size, lflags, node);
}
}
if (likely(ptr) || (flags & KM_NOSLEEP))
return (ptr);
/*
* Try hard to satisfy the allocation. However, when progress
* cannot be made, the allocation is allowed to fail.
*/
if ((lflags & GFP_KERNEL) == GFP_KERNEL)
lflags |= __GFP_RETRY_MAYFAIL;
/*
* Use cond_resched() instead of congestion_wait() to avoid
* deadlocking systems where there are no block devices.
*/
cond_resched();
} while (1);
return (NULL);
}
inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
if (is_vmalloc_addr(buf))
vfree(buf);
else
kfree(buf);
}
/*
* Memory allocation and accounting for kmem_* * style allocations. When
* DEBUG_KMEM is enabled the total memory allocated will be tracked and
* any memory leaked will be reported during module unload.
*
* ./configure --enable-debug-kmem
*/
#ifdef DEBUG_KMEM
/* Shim layer memory accounting */
#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
#else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
#endif /* HAVE_ATOMIC64_T */
EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
void *ptr;
ptr = spl_kmem_alloc_impl(size, flags, node);
if (ptr) {
kmem_alloc_used_add(size);
if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
kmem_alloc_max = kmem_alloc_used_read();
}
return (ptr);
}
inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
kmem_alloc_used_sub(size);
spl_kmem_free_impl(ptr, size);
}
/*
* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
* but also the location of every alloc and free. When the SPL module is
* unloaded a list of all leaked addresses and where they were allocated
* will be dumped to the console. Enabling this feature has a significant
* impact on performance but it makes finding memory leaks straight forward.
*
* Not surprisingly with debugging enabled the xmem_locks are very highly
* contended particularly on xfree(). If we want to run with this detailed
* debugging enabled for anything other than debugging we need to minimize
* the contention by moving to a lock per xmem_table entry model.
*
* ./configure --enable-debug-kmem-tracking
*/
#ifdef DEBUG_KMEM_TRACKING
#include <linux/hash.h>
#include <linux/ctype.h>
#define KMEM_HASH_BITS 10
#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
typedef struct kmem_debug {
struct hlist_node kd_hlist; /* Hash node linkage */
struct list_head kd_list; /* List of all allocations */
void *kd_addr; /* Allocation pointer */
size_t kd_size; /* Allocation size */
const char *kd_func; /* Allocation function */
int kd_line; /* Allocation line */
} kmem_debug_t;
static spinlock_t kmem_lock;
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
static struct list_head kmem_list;
static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
int bits, const void *addr)
{
struct hlist_head *head;
struct hlist_node *node = NULL;
struct kmem_debug *p;
unsigned long flags;
spin_lock_irqsave(lock, flags);
head = &table[hash_ptr((void *)addr, bits)];
hlist_for_each(node, head) {
p = list_entry(node, struct kmem_debug, kd_hlist);
if (p->kd_addr == addr) {
hlist_del_init(&p->kd_hlist);
list_del_init(&p->kd_list);
spin_unlock_irqrestore(lock, flags);
return (p);
}
}
spin_unlock_irqrestore(lock, flags);
return (NULL);
}
inline void *
spl_kmem_alloc_track(size_t size, int flags,
const char *func, int line, int node)
{
void *ptr = NULL;
kmem_debug_t *dptr;
unsigned long irq_flags;
dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
if (dptr == NULL)
return (NULL);
dptr->kd_func = __strdup(func, flags);
if (dptr->kd_func == NULL) {
kfree(dptr);
return (NULL);
}
ptr = spl_kmem_alloc_debug(size, flags, node);
if (ptr == NULL) {
kfree(dptr->kd_func);
kfree(dptr);
return (NULL);
}
INIT_HLIST_NODE(&dptr->kd_hlist);
INIT_LIST_HEAD(&dptr->kd_list);
dptr->kd_addr = ptr;
dptr->kd_size = size;
dptr->kd_line = line;
spin_lock_irqsave(&kmem_lock, irq_flags);
hlist_add_head(&dptr->kd_hlist,
&kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
list_add_tail(&dptr->kd_list, &kmem_list);
spin_unlock_irqrestore(&kmem_lock, irq_flags);
return (ptr);
}
inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
kmem_debug_t *dptr;
/* Ignore NULL pointer since we haven't tracked it at all */
if (ptr == NULL)
return;
/* Must exist in hash due to kmem_alloc() */
dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
ASSERT3P(dptr, !=, NULL);
ASSERT3S(dptr->kd_size, ==, size);
kfree(dptr->kd_func);
kfree(dptr);
spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
/*
* Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
*/
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
ASSERT0(flags & ~KM_PUBLIC_MASK);
#if !defined(DEBUG_KMEM)
return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);
void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
ASSERT0(flags & ~KM_PUBLIC_MASK);
flags |= KM_ZERO;
#if !defined(DEBUG_KMEM)
return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#elif !defined(DEBUG_KMEM_TRACKING)
return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);
void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
return (spl_kmem_free_impl(buf, size));
#elif !defined(DEBUG_KMEM_TRACKING)
return (spl_kmem_free_debug(buf, size));
#else
return (spl_kmem_free_track(buf, size));
#endif
}
EXPORT_SYMBOL(spl_kmem_free);
#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
int i, flag = 1;
ASSERT(str != NULL && len >= 17);
memset(str, 0, len);
/*
* Check for a fully printable string, and while we are at
* it place the printable characters in the passed buffer.
*/
for (i = 0; i < size; i++) {
str[i] = ((char *)(kd->kd_addr))[i];
if (isprint(str[i])) {
continue;
} else {
/*
* Minimum number of printable characters found
* to make it worthwhile to print this as ascii.
*/
if (i > min)
break;
flag = 0;
break;
}
}
if (!flag) {
sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
*((uint8_t *)kd->kd_addr),
*((uint8_t *)kd->kd_addr + 2),
*((uint8_t *)kd->kd_addr + 4),
*((uint8_t *)kd->kd_addr + 6),
*((uint8_t *)kd->kd_addr + 8),
*((uint8_t *)kd->kd_addr + 10),
*((uint8_t *)kd->kd_addr + 12),
*((uint8_t *)kd->kd_addr + 14));
}
return (str);
}
static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
int i;
spin_lock_init(lock);
INIT_LIST_HEAD(list);
for (i = 0; i < size; i++)
INIT_HLIST_HEAD(&kmem_table[i]);
return (0);
}
static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
unsigned long flags;
kmem_debug_t *kd = NULL;
char str[17];
spin_lock_irqsave(lock, flags);
if (!list_empty(list))
printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
"size", "data", "func", "line");
list_for_each_entry(kd, list, kd_list) {
printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
(int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
kd->kd_func, kd->kd_line);
}
spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
kmem_alloc_used_set(0);
#ifdef DEBUG_KMEM_TRACKING
spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
return (0);
}
void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
/*
* Display all unreclaimed memory addresses, including the
* allocation size and the first few bytes of what's located
* at that address to aid in debugging. Performance is not
* a serious concern here since it is module unload time.
*/
if (kmem_alloc_used_read() != 0)
printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
(unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
#ifdef DEBUG_KMEM_TRACKING
spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}