mirror_zfs/module/os/linux/spl/spl-kmem.c
Brian Behlendorf d0249a4bd0
Replace ZFS on Linux references with OpenZFS
This change updates the documentation to refer to the project
as OpenZFS instead of ZFS on Linux.  Web links have been updated
to refer to https://github.com/openzfs/zfs.  The extraneous
zfsonlinux.org web links in the ZED and SPL sources have been
dropped.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Richard Laager <rlaager@wiktel.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #11007
2020-10-08 20:10:13 -07:00

618 lines
16 KiB
C

/*
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
* Copyright (C) 2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Brian Behlendorf <behlendorf1@llnl.gov>.
* UCRL-CODE-235197
*
* This file is part of the SPL, Solaris Porting Layer.
*
* The SPL is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* The SPL is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
/*
 * As a general rule kmem_alloc() allocations should be small, preferably
 * just a few pages since they must be physically contiguous. Therefore, a
 * warning will be printed to the console for any kmem_alloc() which
 * exceeds a reasonable threshold.
 *
 * The default warning threshold is set to sixteen pages but capped at 64K to
 * accommodate systems using large pages. This value was selected to be small
 * enough to ensure the largest allocations are quickly noticed and fixed,
 * but large enough to avoid logging any warnings when an allocation size is
 * larger than optimal but not a serious concern. Since this value is tunable,
 * developers are encouraged to set it lower when testing so any new largish
 * allocations are quickly caught. These warnings may be disabled by setting
 * the threshold to zero.
 */
/* BEGIN CSTYLED */
unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
/* Exposed as a uint module parameter with permission bits 0644. */
module_param(spl_kmem_alloc_warn, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_warn,
"Warning threshold in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_warn);
/*
 * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
 * Allocations which are marginally smaller than this limit may succeed but
 * should still be avoided due to the expense of locating a contiguous range
 * of free pages. Therefore, a maximum kmem size with a reasonable safety
 * margin of 4x is set (KMALLOC_MAX_SIZE >> 2). kmem_alloc() allocations
 * larger than this maximum will quickly fail. vmem_alloc() allocations less
 * than or equal to this value will try kmalloc() first, but shift to
 * vmalloc() when exceeding this value.
 */
unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
/* Exposed as a uint module parameter with permission bits 0644. */
module_param(spl_kmem_alloc_max, uint, 0644);
MODULE_PARM_DESC(spl_kmem_alloc_max,
"Maximum size in bytes for a kmem_alloc()");
EXPORT_SYMBOL(spl_kmem_alloc_max);
/* END CSTYLED */
/*
 * Report whether kmem debugging is active.  This implementation
 * unconditionally reports that it is not by returning 0.
 */
int
kmem_debugging(void)
{
	return (0);
}
EXPORT_SYMBOL(kmem_debugging);
/*
 * Format a string into a newly allocated buffer using a va_list.
 *
 * kvasprintf() is called with the KM_SLEEP flags (as translated by
 * kmem_flags_convert()) and simply retried for as long as it returns
 * NULL, so this function only ever returns a non-NULL pointer.  The
 * caller's va_list is copied on every attempt and is never consumed.
 */
char *
kmem_vasprintf(const char *fmt, va_list ap)
{
	char *buf = NULL;

	while (buf == NULL) {
		va_list args;

		va_copy(args, ap);
		buf = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, args);
		va_end(args);
	}

	return (buf);
}
EXPORT_SYMBOL(kmem_vasprintf);
/*
 * Format a string into a newly allocated buffer (printf-style varargs).
 *
 * The variadic arguments are restarted for every attempt and
 * kvasprintf() is retried until it yields a non-NULL buffer, so this
 * function only ever returns a non-NULL pointer.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	char *buf = NULL;

	while (buf == NULL) {
		va_list args;

		va_start(args, fmt);
		buf = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, args);
		va_end(args);
	}

	return (buf);
}
EXPORT_SYMBOL(kmem_asprintf);
/*
 * Duplicate a NUL-terminated string into a buffer obtained from kmalloc().
 *
 * str   - string to copy; it is passed straight to strlen() and so must
 *         not be NULL.
 * flags - KM_* allocation flags, translated by kmem_flags_convert().
 *
 * Returns the copy (including the terminating NUL), or NULL when
 * kmalloc() fails.  The length is kept in a size_t so that the value
 * returned by strlen() is never narrowed to a signed int.
 */
static char *
__strdup(const char *str, int flags)
{
	size_t len = strlen(str);
	char *copy;

	copy = kmalloc(len + 1, kmem_flags_convert(flags));
	if (copy != NULL)
		memcpy(copy, str, len + 1);

	return (copy);
}
/*
 * Public string duplication helper: copies str using __strdup() with
 * KM_SLEEP flags.  The result is whatever __strdup() returns, which is
 * NULL if the underlying allocation fails.
 */
char *
kmem_strdup(const char *str)
{
	char *copy = __strdup(str, KM_SLEEP);

	return (copy);
}
EXPORT_SYMBOL(kmem_strdup);
/*
 * Release a string by handing it to kfree().
 */
void
kmem_strfree(char *str)
{
	kfree(str);
}
EXPORT_SYMBOL(kmem_strfree);
/*
 * Allocate size bytes, preferring kmalloc and falling back to
 * spl_vmalloc() for larger requests when the flags permit it.
 *
 * size   - number of bytes requested.
 * lflags - Linux gfp_t flags supplied by the caller.
 *
 * Returns the allocation, or NULL on failure.
 */
void *
spl_kvmalloc(size_t size, gfp_t lflags)
{
	gfp_t first_try_flags = lflags;
	int may_fallback;
	void *buf;

#ifdef HAVE_KVMALLOC
	/*
	 * Full GFP_KERNEL allocations are delegated to kvmalloc().  Per the
	 * original authors this may improve performance by avoiding
	 * a) high latency caused by vmalloc's on-access allocation,
	 * b) performance loss due to MMU memory address mapping and
	 * c) vmalloc locking overhead.  A side-effect is that the slab
	 * statistics will incorrectly report this as a vmem allocation,
	 * which is purely cosmetic.
	 */
	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
		return (kvmalloc(size, lflags));
#endif

	if (size > PAGE_SIZE) {
		/*
		 * __GFP_NOWARN is set explicitly because spl_kvmalloc() may
		 * be called directly with custom lflags rather than through
		 * spl_kmem_alloc_impl(); in that case kmem_flags_convert(),
		 * which would implicitly set __GFP_NOWARN, is never called.
		 */
		first_try_flags |= __GFP_NOWARN;

		/*
		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
		 * (>32kB) allocations.
		 *
		 * For !costly requests __GFP_RETRY_MAYFAIL is overridden by
		 * __GFP_NORETRY because there is no other way to tell the
		 * allocator that we want to fail rather than retry
		 * endlessly.
		 */
		if ((size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
		    !(first_try_flags & __GFP_RETRY_MAYFAIL))
			first_try_flags |= __GFP_NORETRY;
	}

	/*
	 * kmalloc_node() is always tried first, even for big sizes.
	 *
	 * The spl_vmalloc() fallback is only taken for requests larger
	 * than a page whose flags include all of __GFP_RECLAIM.  For
	 * non-__GFP_RECLAIM allocations the kmalloc_node() result is
	 * final, because spl_vmalloc() internally uses GFP_KERNEL
	 * allocations.
	 */
	buf = kmalloc_node(size, first_try_flags, NUMA_NO_NODE);

	may_fallback = (size > PAGE_SIZE) &&
	    ((lflags & __GFP_RECLAIM) == __GFP_RECLAIM);
	if (buf == NULL && may_fallback)
		return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));

	return (buf);
}
/*
 * General purpose unified implementation of kmem_alloc(). It is an
 * amalgamation of Linux and Illumos allocator design. It should never be
 * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
 * relatively portable. Consumers may only access this function through
 * wrappers that enforce the common flags to ensure portability.
 *
 * size  - number of bytes requested.
 * flags - KM_* flags; translated once by kmem_flags_convert().
 * node  - node argument forwarded to kmalloc_node() on the kmem path.
 *
 * Returns the allocation.  NULL is returned when a non-KM_VMEM request
 * exceeds spl_kmem_alloc_max, or when an attempt fails and KM_NOSLEEP
 * is set; otherwise the allocation is retried until it succeeds.
 */
inline void *
spl_kmem_alloc_impl(size_t size, int flags, int node)
{
	gfp_t lflags = kmem_flags_convert(flags);
	void *ptr;

	/*
	 * Log abnormally large allocations together with a stack dump.
	 * Allocations larger than spl_kmem_alloc_warn should be performed
	 * through the vmem_alloc()/vmem_zalloc() interfaces.
	 */
	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
	    !(flags & KM_VMEM)) {
		printk(KERN_WARNING
		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
		    "https://github.com/openzfs/zfs/issues/new\n",
		    (unsigned long)size, flags);
		dump_stack();
	}

	/*
	 * Loop because kmalloc_node() can fail when GFP_KERNEL is used,
	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
	 */
	for (;;) {
		if (size > spl_kmem_alloc_max) {
			/*
			 * Calling kmalloc_node() for sizes above
			 * spl_kmem_alloc_max is unsafe, so kmem_alloc() and
			 * kmem_zalloc() callers must fail here.
			 *
			 * vmem_alloc() and vmem_zalloc() callers may use
			 * spl_vmalloc().  Its use is nevertheless strongly
			 * discouraged because a global lock must be
			 * acquired, and contention on that lock can
			 * significantly impact performance.
			 */
			if (!(flags & KM_VMEM))
				return (NULL);

			ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
		} else if (flags & KM_VMEM) {
			ptr = spl_kvmalloc(size, lflags);
		} else {
			ptr = kmalloc_node(size, lflags, node);
		}

		if (likely(ptr) || (flags & KM_NOSLEEP))
			return (ptr);

		/*
		 * Try hard to satisfy the allocation. However, when progress
		 * cannot be made, the allocation is allowed to fail.
		 */
		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
			lflags |= __GFP_RETRY_MAYFAIL;

		/*
		 * Use cond_resched() instead of congestion_wait() to avoid
		 * deadlocking systems where there are no block devices.
		 */
		cond_resched();
	}
}
/*
 * Release a buffer with the deallocator matching how it was obtained:
 * vfree() when is_vmalloc_addr() identifies the address as a vmalloc
 * address, kfree() otherwise.  The size argument is not used here.
 */
inline void
spl_kmem_free_impl(const void *buf, size_t size)
{
	if (is_vmalloc_addr(buf)) {
		vfree(buf);
		return;
	}

	kfree(buf);
}
/*
* Memory allocation and accounting for kmem_* style allocations. When
* DEBUG_KMEM is enabled the total memory allocated will be tracked and
* any memory leaked will be reported during module unload.
*
* ./configure --enable-debug-kmem
*/
#ifdef DEBUG_KMEM
/*
 * Shim layer memory accounting.
 *
 * kmem_alloc_used is the running counter adjusted by the debug alloc/free
 * wrappers in this file; kmem_alloc_max records the largest counter value
 * observed after an allocation.  The counter is a 64-bit atomic when
 * HAVE_ATOMIC64_T is defined and a plain atomic_t otherwise; the maximum
 * is an unsigned long long in both configurations.
 */
#ifdef HAVE_ATOMIC64_T
atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
unsigned long long kmem_alloc_max = 0;
#else /* HAVE_ATOMIC64_T */
atomic_t kmem_alloc_used = ATOMIC_INIT(0);
unsigned long long kmem_alloc_max = 0;
#endif /* HAVE_ATOMIC64_T */
EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
/*
 * Accounting wrapper around spl_kmem_alloc_impl().
 *
 * On success the requested size is added to the usage counter and
 * kmem_alloc_max is raised if the counter now exceeds it.  A failed
 * allocation leaves the accounting untouched and returns NULL.
 */
inline void *
spl_kmem_alloc_debug(size_t size, int flags, int node)
{
	void *buf = spl_kmem_alloc_impl(size, flags, node);

	if (!buf)
		return (buf);

	kmem_alloc_used_add(size);
	if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
		kmem_alloc_max = kmem_alloc_used_read();

	return (buf);
}
/*
 * Accounting wrapper around spl_kmem_free_impl(): subtracts size from
 * the usage counter and then releases the buffer.
 */
inline void
spl_kmem_free_debug(const void *ptr, size_t size)
{
	kmem_alloc_used_sub(size);

	spl_kmem_free_impl(ptr, size);
}
/*
* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
* but also the location of every alloc and free. When the SPL module is
* unloaded a list of all leaked addresses and where they were allocated
* will be dumped to the console. Enabling this feature has a significant
* impact on performance but it makes finding memory leaks straightforward.
*
* Not surprisingly with debugging enabled the kmem_lock is very highly
* contended, particularly on free. If we want to run with this detailed
* debugging enabled for anything other than debugging we need to minimize
* the contention by moving to a lock per kmem_table entry model.
*
* ./configure --enable-debug-kmem-tracking
*/
#ifdef DEBUG_KMEM_TRACKING
#include <linux/hash.h>
#include <linux/ctype.h>
/* Number of hash bits used to index kmem_table (1 << 10 = 1024 buckets). */
#define KMEM_HASH_BITS 10
#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
/*
 * Tracking record for one outstanding allocation.  Each record is linked
 * into a kmem_table hash bucket (keyed on the allocation address) and
 * onto the global kmem_list.
 */
typedef struct kmem_debug {
struct hlist_node kd_hlist; /* Hash node linkage */
struct list_head kd_list; /* List of all allocations */
void *kd_addr; /* Allocation pointer */
size_t kd_size; /* Allocation size */
const char *kd_func; /* Allocation function */
int kd_line; /* Allocation line */
} kmem_debug_t;
/* Lock held while kmem_table and kmem_list are modified or walked. */
static spinlock_t kmem_lock;
/* Hash table of tracking records, indexed by hash_ptr() of the address. */
static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
/* List of every tracking record, in allocation order (added at the tail). */
static struct list_head kmem_list;
/*
 * Find the tracking record for addr and unlink it.
 *
 * lock  - spinlock taken (with interrupts saved) around the lookup.
 * table - hash table of tracking records.
 * bits  - number of hash bits used to pick the bucket.
 * addr  - allocation address to search for.
 *
 * Returns the record, already removed from both its hash bucket and
 * the allocation list, or NULL when no record matches addr.
 */
static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
    int bits, const void *addr)
{
	struct kmem_debug *found = NULL;
	struct kmem_debug *entry;
	struct hlist_node *node = NULL;
	struct hlist_head *bucket;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	bucket = &table[hash_ptr((void *)addr, bits)];
	hlist_for_each(node, bucket) {
		entry = list_entry(node, struct kmem_debug, kd_hlist);
		if (entry->kd_addr != addr)
			continue;

		hlist_del_init(&entry->kd_hlist);
		list_del_init(&entry->kd_list);
		found = entry;
		break;
	}

	spin_unlock_irqrestore(lock, flags);

	return (found);
}
/*
 * Allocate memory and record where the allocation came from.
 *
 * A kmem_debug_t record and a private copy of the calling function
 * name are allocated first, followed by the buffer itself (through
 * spl_kmem_alloc_debug()).  If any of the three allocations fails,
 * whatever was already obtained is released and NULL is returned.
 * On success the record is inserted into kmem_table and appended to
 * kmem_list under kmem_lock.
 *
 * size  - number of bytes requested.
 * flags - KM_* flags used for all three allocations.
 * func  - name of the calling function.
 * line  - line number of the call site.
 * node  - node argument forwarded to spl_kmem_alloc_debug().
 */
inline void *
spl_kmem_alloc_track(size_t size, int flags,
    const char *func, int line, int node)
{
	kmem_debug_t *tracker;
	unsigned long irq_flags;
	void *buf = NULL;

	tracker = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
	if (tracker == NULL)
		return (NULL);

	tracker->kd_func = __strdup(func, flags);
	if (tracker->kd_func == NULL)
		goto free_tracker;

	buf = spl_kmem_alloc_debug(size, flags, node);
	if (buf == NULL)
		goto free_func;

	INIT_HLIST_NODE(&tracker->kd_hlist);
	INIT_LIST_HEAD(&tracker->kd_list);
	tracker->kd_addr = buf;
	tracker->kd_size = size;
	tracker->kd_line = line;

	spin_lock_irqsave(&kmem_lock, irq_flags);
	hlist_add_head(&tracker->kd_hlist,
	    &kmem_table[hash_ptr(buf, KMEM_HASH_BITS)]);
	list_add_tail(&tracker->kd_list, &kmem_list);
	spin_unlock_irqrestore(&kmem_lock, irq_flags);

	return (buf);

free_func:
	kfree(tracker->kd_func);
free_tracker:
	kfree(tracker);
	return (NULL);
}
/*
 * Free a tracked allocation and discard its tracking record.
 *
 * A NULL ptr is ignored because it was never tracked.  Otherwise the
 * record is looked up and unlinked via kmem_del_init(); it is asserted
 * to exist and to carry the same size that the caller passed in.  The
 * record and its function-name copy are freed before the buffer itself
 * is released through spl_kmem_free_debug().
 */
inline void
spl_kmem_free_track(const void *ptr, size_t size)
{
	kmem_debug_t *tracker;

	if (!ptr)
		return;

	/* Every tracked allocation was hashed by spl_kmem_alloc_track(). */
	tracker = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
	ASSERT3P(tracker, !=, NULL);
	ASSERT3S(tracker->kd_size, ==, size);

	kfree(tracker->kd_func);
	kfree(tracker);

	spl_kmem_free_debug(ptr, size);
}
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
/*
* Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
*/
/*
 * Public kmem_alloc() entry point.
 *
 * Asserts that only KM_PUBLIC_MASK flags were passed and then forwards
 * to the implementation selected at build time: the tracking variant
 * when both DEBUG_KMEM and DEBUG_KMEM_TRACKING are defined, the
 * accounting variant when only DEBUG_KMEM is defined, and the plain
 * implementation otherwise.  func and line are only consumed by the
 * tracking variant.
 */
void *
spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
#elif defined(DEBUG_KMEM)
	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_alloc);
/*
 * Public kmem_zalloc() entry point.
 *
 * Same dispatch as spl_kmem_alloc(), except that KM_ZERO is added to
 * the caller's flags (after they have been checked against
 * KM_PUBLIC_MASK) before the request is forwarded.
 */
void *
spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
{
	ASSERT0(flags & ~KM_PUBLIC_MASK);

#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
	return (spl_kmem_alloc_track(size, flags | KM_ZERO, func, line,
	    NUMA_NO_NODE));
#elif defined(DEBUG_KMEM)
	return (spl_kmem_alloc_debug(size, flags | KM_ZERO, NUMA_NO_NODE));
#else
	return (spl_kmem_alloc_impl(size, flags | KM_ZERO, NUMA_NO_NODE));
#endif
}
EXPORT_SYMBOL(spl_kmem_zalloc);
/*
 * Public kmem_free() entry point.
 *
 * Forwards to the free routine matching the build configuration: the
 * plain implementation without DEBUG_KMEM, the accounting variant with
 * DEBUG_KMEM only, and the tracking variant when DEBUG_KMEM_TRACKING
 * is defined as well.
 *
 * The helpers are called as plain statements: a return statement with
 * an expression is not permitted in a function returning void.
 */
void
spl_kmem_free(const void *buf, size_t size)
{
#if !defined(DEBUG_KMEM)
	spl_kmem_free_impl(buf, size);
#elif !defined(DEBUG_KMEM_TRACKING)
	spl_kmem_free_debug(buf, size);
#else
	spl_kmem_free_track(buf, size);
#endif
}
EXPORT_SYMBOL(spl_kmem_free);
#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
/*
 * Render the first bytes of a tracked allocation into str for the leak
 * report.
 *
 * kd  - tracking record describing the allocation.
 * str - output buffer; must not be NULL.
 * len - size of str in bytes; must be at least 17.
 * min - if the first non-printable byte is found at an index greater
 *       than min, the printable prefix is reported as text; otherwise
 *       the hex form is used.
 *
 * Returns str.  The text form holds at most len - 1 printable
 * characters.  The hex form prints the bytes at even offsets 0..14 as
 * two hex digits each.
 *
 * No more than kd_size bytes of the allocation are ever read: hex
 * positions beyond the end of the allocation are printed as "00".  The
 * non-printable byte that ends the text scan is not left in the output.
 */
static char *
spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
{
	const uint8_t *data = (const uint8_t *)kd->kd_addr;
	int printable = 1;
	int size, i;

	ASSERT(str != NULL && len >= 17);

	size = ((size_t)(len - 1) < kd->kd_size) ?
	    (len - 1) : (int)kd->kd_size;
	memset(str, 0, len);

	/*
	 * Check for a fully printable string, and while we are at
	 * it place the printable characters in the passed buffer.
	 */
	for (i = 0; i < size; i++) {
		if (isprint(data[i])) {
			str[i] = (char)data[i];
			continue;
		}

		/*
		 * Minimum number of printable characters found
		 * to make it worthwhile to print this as ascii.
		 */
		if (i <= min)
			printable = 0;
		break;
	}

	if (!printable) {
		int off = 0;
		int j;

		/* Each byte needs two digits plus room for the final NUL. */
		for (j = 0; j < 8 && off + 2 < len; j++) {
			size_t idx = (size_t)j * 2;
			uint8_t byte = (idx < kd->kd_size) ? data[idx] : 0;

			off += snprintf(str + off, (size_t)(len - off),
			    "%02x", byte);
		}
	}

	return (str);
}
/*
 * Prepare the allocation tracking state: initialise the spinlock and
 * the list head passed in, then reset the first size buckets of the
 * global kmem_table.  Always returns 0.
 */
static int
spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
{
	int bucket = 0;

	spin_lock_init(lock);
	INIT_LIST_HEAD(list);

	while (bucket < size) {
		INIT_HLIST_HEAD(&kmem_table[bucket]);
		bucket++;
	}

	return (0);
}
/*
 * Report every allocation still present on the tracking list.
 *
 * With the lock held, a column header is printed if the list is not
 * empty, followed by one line per record giving its address, size, a
 * preview of its contents (see spl_sprintf_addr()) and the function
 * and line that allocated it.  The records themselves are not freed.
 */
static void
spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
{
	char preview[17];
	kmem_debug_t *kd = NULL;
	unsigned long flags;

	spin_lock_irqsave(lock, flags);

	if (!list_empty(list)) {
		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
		    "size", "data", "func", "line");
	}

	list_for_each_entry(kd, list, kd_list) {
		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
		    (int)kd->kd_size,
		    spl_sprintf_addr(kd, preview, sizeof (preview), 8),
		    kd->kd_func, kd->kd_line);
	}

	spin_unlock_irqrestore(lock, flags);
}
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
/*
 * Initialise the kmem layer.
 *
 * With DEBUG_KMEM the usage counter is reset to zero, and with
 * DEBUG_KMEM_TRACKING the tracking lock, list and hash table are set
 * up as well.  Without DEBUG_KMEM there is nothing to do.  Always
 * returns 0.
 */
int
spl_kmem_init(void)
{
#ifdef DEBUG_KMEM
	kmem_alloc_used_set(0);

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */

	return (0);
}
/*
 * Tear down the kmem layer.
 *
 * With DEBUG_KMEM, a warning giving the outstanding byte count and the
 * recorded maximum is logged when the usage counter is non-zero.  With
 * DEBUG_KMEM_TRACKING, every allocation still on the tracking list is
 * also reported, including its size and the first few bytes found at
 * that address to aid in debugging.  Performance is not a serious
 * concern here since this runs at module unload time.
 */
void
spl_kmem_fini(void)
{
#ifdef DEBUG_KMEM
	/*
	 * The counter is cast to long so that the argument type matches
	 * the %ld conversion used in the message.
	 */
	if (kmem_alloc_used_read() != 0) {
		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
		    (long)kmem_alloc_used_read(), kmem_alloc_max);
	}

#ifdef DEBUG_KMEM_TRACKING
	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}