mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-25 01:16:34 +03:00
Merge branch 'kmem-cache-optimization'
This branch contains kmem cache optimizations designed to resolve the lockups reported in zfsonlinux/zfs#922. The lockups were largely the result of spin lock contention in the slab under low memory conditions. Fundamentally, these changes are all designed to minimize that contention through a variety of methods. * Improved vmem cached deadlock detection * Track emergency objects in rbtree * Optimize spl_kmem_cache_free() * Never spin in kmem_cache_alloc() Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> zfsonlinux/zfs#922
This commit is contained in:
commit
366346c565
@ -31,6 +31,7 @@
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <asm/atomic.h>
|
||||
#include <sys/types.h>
|
||||
@ -340,6 +341,7 @@ enum {
|
||||
KMC_BIT_VMEM = 6, /* Use vmem cache */
|
||||
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
|
||||
KMC_BIT_NOEMERGENCY = 8, /* Disable emergency objects */
|
||||
KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
|
||||
KMC_BIT_GROWING = 15, /* Growing in progress */
|
||||
KMC_BIT_REAPING = 16, /* Reaping in progress */
|
||||
KMC_BIT_DESTROY = 17, /* Destroy in progress */
|
||||
@ -366,6 +368,7 @@ typedef enum kmem_cbrc {
|
||||
#define KMC_VMEM (1 << KMC_BIT_VMEM)
|
||||
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
|
||||
#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
|
||||
#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
|
||||
#define KMC_GROWING (1 << KMC_BIT_GROWING)
|
||||
#define KMC_REAPING (1 << KMC_BIT_REAPING)
|
||||
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
|
||||
@ -433,8 +436,8 @@ typedef struct spl_kmem_alloc {
|
||||
} spl_kmem_alloc_t;
|
||||
|
||||
typedef struct spl_kmem_emergency {
|
||||
struct rb_node ske_node; /* Emergency tree linkage */
|
||||
void *ske_obj; /* Buffer address */
|
||||
struct list_head ske_list; /* Emergency list linkage */
|
||||
} spl_kmem_emergency_t;
|
||||
|
||||
typedef struct spl_kmem_cache {
|
||||
@ -461,7 +464,7 @@ typedef struct spl_kmem_cache {
|
||||
struct list_head skc_list; /* List of caches linkage */
|
||||
struct list_head skc_complete_list;/* Completely alloc'ed */
|
||||
struct list_head skc_partial_list; /* Partially alloc'ed */
|
||||
struct list_head skc_emergency_list; /* Min sized objects */
|
||||
struct rb_root skc_emergency_tree; /* Min sized objects */
|
||||
spinlock_t skc_lock; /* Cache lock */
|
||||
wait_queue_head_t skc_waitq; /* Allocation waiters */
|
||||
uint64_t skc_slab_fail; /* Slab alloc failures */
|
||||
@ -473,6 +476,7 @@ typedef struct spl_kmem_cache {
|
||||
uint64_t skc_obj_total; /* Obj total current */
|
||||
uint64_t skc_obj_alloc; /* Obj alloc current */
|
||||
uint64_t skc_obj_max; /* Obj max historic */
|
||||
uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */
|
||||
uint64_t skc_obj_emergency; /* Obj emergency current */
|
||||
uint64_t skc_obj_emergency_max; /* Obj emergency max */
|
||||
} spl_kmem_cache_t;
|
||||
|
@ -1116,8 +1116,54 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
|
||||
SEXIT;
|
||||
}
|
||||
|
||||
static spl_kmem_emergency_t *
|
||||
spl_emergency_search(struct rb_root *root, void *obj)
|
||||
{
|
||||
struct rb_node *node = root->rb_node;
|
||||
spl_kmem_emergency_t *ske;
|
||||
unsigned long address = (unsigned long)obj;
|
||||
|
||||
while (node) {
|
||||
ske = container_of(node, spl_kmem_emergency_t, ske_node);
|
||||
|
||||
if (address < (unsigned long)ske->ske_obj)
|
||||
node = node->rb_left;
|
||||
else if (address > (unsigned long)ske->ske_obj)
|
||||
node = node->rb_right;
|
||||
else
|
||||
return ske;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int
|
||||
spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
|
||||
{
|
||||
struct rb_node **new = &(root->rb_node), *parent = NULL;
|
||||
spl_kmem_emergency_t *ske_tmp;
|
||||
unsigned long address = (unsigned long)ske->ske_obj;
|
||||
|
||||
while (*new) {
|
||||
ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
|
||||
|
||||
parent = *new;
|
||||
if (address < (unsigned long)ske_tmp->ske_obj)
|
||||
new = &((*new)->rb_left);
|
||||
else if (address > (unsigned long)ske_tmp->ske_obj)
|
||||
new = &((*new)->rb_right);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
rb_link_node(&ske->ske_node, parent, new);
|
||||
rb_insert_color(&ske->ske_node, root);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a single emergency object for use by the caller.
|
||||
* Allocate a single emergency object and track it in a red black tree.
|
||||
*/
|
||||
static int
|
||||
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
|
||||
@ -1143,48 +1189,49 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
|
||||
SRETURN(-ENOMEM);
|
||||
}
|
||||
|
||||
spin_lock(&skc->skc_lock);
|
||||
empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
|
||||
if (likely(empty)) {
|
||||
skc->skc_obj_total++;
|
||||
skc->skc_obj_emergency++;
|
||||
if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
|
||||
skc->skc_obj_emergency_max = skc->skc_obj_emergency;
|
||||
}
|
||||
spin_unlock(&skc->skc_lock);
|
||||
|
||||
if (unlikely(!empty)) {
|
||||
kfree(ske->ske_obj);
|
||||
kfree(ske);
|
||||
SRETURN(-EINVAL);
|
||||
}
|
||||
|
||||
if (skc->skc_ctor)
|
||||
skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
|
||||
|
||||
spin_lock(&skc->skc_lock);
|
||||
skc->skc_obj_total++;
|
||||
skc->skc_obj_emergency++;
|
||||
if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
|
||||
skc->skc_obj_emergency_max = skc->skc_obj_emergency;
|
||||
|
||||
list_add(&ske->ske_list, &skc->skc_emergency_list);
|
||||
spin_unlock(&skc->skc_lock);
|
||||
|
||||
*obj = ske->ske_obj;
|
||||
|
||||
SRETURN(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free the passed object if it is an emergency object or a normal slab
|
||||
* object. Currently this is done by walking what should be a short list of
|
||||
* emergency objects. If this proves to be too inefficient we can replace
|
||||
* the simple list with a hash.
|
||||
* Locate the passed object in the red black tree and free it.
|
||||
*/
|
||||
static int
|
||||
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
|
||||
{
|
||||
spl_kmem_emergency_t *m, *n, *ske = NULL;
|
||||
spl_kmem_emergency_t *ske;
|
||||
SENTRY;
|
||||
|
||||
spin_lock(&skc->skc_lock);
|
||||
list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
|
||||
if (m->ske_obj == obj) {
|
||||
list_del(&m->ske_list);
|
||||
skc->skc_obj_emergency--;
|
||||
skc->skc_obj_total--;
|
||||
ske = m;
|
||||
break;
|
||||
}
|
||||
ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
|
||||
if (likely(ske)) {
|
||||
rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
|
||||
skc->skc_obj_emergency--;
|
||||
skc->skc_obj_total--;
|
||||
}
|
||||
spin_unlock(&skc->skc_lock);
|
||||
|
||||
if (ske == NULL)
|
||||
if (unlikely(ske == NULL))
|
||||
SRETURN(-ENOENT);
|
||||
|
||||
if (skc->skc_dtor)
|
||||
@ -1483,7 +1530,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
|
||||
INIT_LIST_HEAD(&skc->skc_list);
|
||||
INIT_LIST_HEAD(&skc->skc_complete_list);
|
||||
INIT_LIST_HEAD(&skc->skc_partial_list);
|
||||
INIT_LIST_HEAD(&skc->skc_emergency_list);
|
||||
skc->skc_emergency_tree = RB_ROOT;
|
||||
spin_lock_init(&skc->skc_lock);
|
||||
init_waitqueue_head(&skc->skc_waitq);
|
||||
skc->skc_slab_fail = 0;
|
||||
@ -1495,6 +1542,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
|
||||
skc->skc_obj_total = 0;
|
||||
skc->skc_obj_alloc = 0;
|
||||
skc->skc_obj_max = 0;
|
||||
skc->skc_obj_deadlock = 0;
|
||||
skc->skc_obj_emergency = 0;
|
||||
skc->skc_obj_emergency_max = 0;
|
||||
|
||||
@ -1589,7 +1637,6 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
|
||||
ASSERT3U(skc->skc_obj_total, ==, 0);
|
||||
ASSERT3U(skc->skc_obj_emergency, ==, 0);
|
||||
ASSERT(list_empty(&skc->skc_complete_list));
|
||||
ASSERT(list_empty(&skc->skc_emergency_list));
|
||||
|
||||
kmem_free(skc->skc_name, skc->skc_name_size);
|
||||
spin_unlock(&skc->skc_lock);
|
||||
@ -1662,6 +1709,7 @@ spl_cache_grow_work(void *data)
|
||||
|
||||
atomic_dec(&skc->skc_ref);
|
||||
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
|
||||
clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
|
||||
wake_up_all(&skc->skc_waitq);
|
||||
spin_unlock(&skc->skc_lock);
|
||||
|
||||
@ -1677,13 +1725,20 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc)
|
||||
return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
|
||||
}
|
||||
|
||||
/*
 * Action routine handed to wait_on_bit() while waiting for the
 * KMC_BIT_REAPING flag to clear.  The 'word' argument is unused; the
 * routine just calls schedule() and reports 0.
 */
static int
spl_cache_reclaim_wait(void *word)
{
	schedule();

	return 0;
}
|
||||
|
||||
/*
|
||||
* No available objects on any slabs, create a new slab.
|
||||
*/
|
||||
static int
|
||||
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
|
||||
{
|
||||
int remaining, rc = 0;
|
||||
int remaining, rc;
|
||||
SENTRY;
|
||||
|
||||
ASSERT(skc->skc_magic == SKC_MAGIC);
|
||||
@ -1691,12 +1746,14 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
|
||||
*obj = NULL;
|
||||
|
||||
/*
|
||||
* Before allocating a new slab check if the slab is being reaped.
|
||||
* If it is there is a good chance we can wait until it finishes
|
||||
* and then use one of the newly freed but not aged-out slabs.
|
||||
* Before allocating a new slab wait for any reaping to complete and
|
||||
* then return so the local magazine can be rechecked for new objects.
|
||||
*/
|
||||
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
|
||||
SRETURN(-EAGAIN);
|
||||
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
|
||||
rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
|
||||
spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
|
||||
SRETURN(rc ? rc : -EAGAIN);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is handled by dispatching a work request to the global work
|
||||
@ -1722,17 +1779,30 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
|
||||
}
|
||||
|
||||
/*
|
||||
* Allow a single timer tick before falling back to synchronously
|
||||
* allocating the minimum amount of memory required by the caller.
|
||||
* The goal here is to only detect the rare case where a virtual slab
|
||||
* allocation has deadlocked. We must be careful to minimize the use
|
||||
* of emergency objects which are more expensive to track. Therefore,
|
||||
* we set a very long timeout for the asynchronous allocation and if
|
||||
* the timeout is reached the cache is flagged as deadlocked. From
|
||||
* this point only new emergency objects will be allocated until the
|
||||
* asynchronous allocation completes and clears the deadlocked flag.
|
||||
*/
|
||||
remaining = wait_event_timeout(skc->skc_waitq,
|
||||
spl_cache_grow_wait(skc), 1);
|
||||
if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
|
||||
rc = spl_emergency_alloc(skc, flags, obj);
|
||||
} else {
|
||||
remaining = wait_event_timeout(skc->skc_waitq,
|
||||
spl_cache_grow_wait(skc), HZ);
|
||||
|
||||
if (remaining == 0) {
|
||||
if (test_bit(KMC_BIT_NOEMERGENCY, &skc->skc_flags))
|
||||
rc = -ENOMEM;
|
||||
else
|
||||
rc = spl_emergency_alloc(skc, flags, obj);
|
||||
if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
|
||||
spin_lock(&skc->skc_lock);
|
||||
if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
|
||||
set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
|
||||
skc->skc_obj_deadlock++;
|
||||
}
|
||||
spin_unlock(&skc->skc_lock);
|
||||
}
|
||||
|
||||
rc = -ENOMEM;
|
||||
}
|
||||
|
||||
SRETURN(rc);
|
||||
@ -1962,11 +2032,12 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
|
||||
atomic_inc(&skc->skc_ref);
|
||||
|
||||
/*
|
||||
* Emergency objects are never part of the virtual address space
|
||||
* so if we get a virtual address we can optimize this check out.
|
||||
* Only virtual slabs may have emergency objects and these objects
|
||||
* are guaranteed to have physical addresses. They must be removed
|
||||
* from the tree of emergency objects and then freed.
|
||||
*/
|
||||
if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
|
||||
SGOTO(out, 0);
|
||||
if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
|
||||
SGOTO(out, spl_emergency_free(skc, obj));
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
@ -2094,6 +2165,9 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
|
||||
/* Reclaim from the cache, ignoring its age and delay. */
|
||||
spl_slab_reclaim(skc, count, 1);
|
||||
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
|
||||
smp_mb__after_clear_bit();
|
||||
wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
|
||||
|
||||
atomic_dec(&skc->skc_ref);
|
||||
|
||||
SEXIT;
|
||||
|
@ -625,12 +625,14 @@ slab_seq_show_headers(struct seq_file *f)
|
||||
"--------------------- cache ----------"
|
||||
"--------------------------------------------- "
|
||||
"----- slab ------ "
|
||||
"---- object -----------------\n");
|
||||
"---- object ----- "
|
||||
"--- emergency ---\n");
|
||||
seq_printf(f,
|
||||
"name "
|
||||
" flags size alloc slabsize objsize "
|
||||
"total alloc max "
|
||||
"total alloc max emerg max\n");
|
||||
"total alloc max "
|
||||
"dlock alloc max\n");
|
||||
}
|
||||
|
||||
static int
|
||||
@ -643,7 +645,7 @@ slab_seq_show(struct seq_file *f, void *p)
|
||||
spin_lock(&skc->skc_lock);
|
||||
seq_printf(f, "%-36s ", skc->skc_name);
|
||||
seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
|
||||
"%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
|
||||
"%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
|
||||
(long unsigned)skc->skc_flags,
|
||||
(long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
|
||||
(long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
|
||||
@ -655,6 +657,7 @@ slab_seq_show(struct seq_file *f, void *p)
|
||||
(long unsigned)skc->skc_obj_total,
|
||||
(long unsigned)skc->skc_obj_alloc,
|
||||
(long unsigned)skc->skc_obj_max,
|
||||
(long unsigned)skc->skc_obj_deadlock,
|
||||
(long unsigned)skc->skc_obj_emergency,
|
||||
(long unsigned)skc->skc_obj_emergency_max);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user