Merge branch 'kmem-cache-optimization'

This branch contains kmem cache optimizations designed to resolve
the lockups reported in zfsonlinux/zfs#922.  The lockups were
largely the result of spin lock contention in the slab under low
memory conditions.  Fundamentally, these changes are all designed
to minimize that contention through a variety of methods.

  * Improved vmem cached deadlock detection
  * Track emergency objects in rbtree
  * Optimize spl_kmem_cache_free()
  * Never spin in kmem_cache_alloc()

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
zfsonlinux/zfs#922
This commit is contained in:
Brian Behlendorf 2012-11-08 11:00:23 -08:00
commit 366346c565
3 changed files with 131 additions and 50 deletions

View File

@ -31,6 +31,7 @@
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/ctype.h>
#include <asm/atomic.h>
#include <sys/types.h>
@ -340,6 +341,7 @@ enum {
KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
KMC_BIT_NOEMERGENCY = 8, /* Disable emergency objects */
KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
@ -366,6 +368,7 @@ typedef enum kmem_cbrc {
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
@ -433,8 +436,8 @@ typedef struct spl_kmem_alloc {
} spl_kmem_alloc_t;
typedef struct spl_kmem_emergency {
struct rb_node ske_node; /* Emergency tree linkage */
void *ske_obj; /* Buffer address */
struct list_head ske_list; /* Emergency list linkage */
} spl_kmem_emergency_t;
typedef struct spl_kmem_cache {
@ -461,7 +464,7 @@ typedef struct spl_kmem_cache {
struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */
struct list_head skc_emergency_list; /* Min sized objects */
struct rb_root skc_emergency_tree; /* Min sized objects */
spinlock_t skc_lock; /* Cache lock */
wait_queue_head_t skc_waitq; /* Allocation waiters */
uint64_t skc_slab_fail; /* Slab alloc failures */
@ -473,6 +476,7 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */
uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */
uint64_t skc_obj_emergency; /* Obj emergency current */
uint64_t skc_obj_emergency_max; /* Obj emergency max */
} spl_kmem_cache_t;

View File

@ -1116,8 +1116,54 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
SEXIT;
}
/*
 * Find the emergency object record tracking the buffer 'obj'.
 *
 * The tree rooted at 'root' is keyed by each record's ske_obj address,
 * compared as an unsigned long.  Smaller addresses live in the left
 * subtree and larger addresses in the right subtree.
 *
 * Returns the matching spl_kmem_emergency_t, or NULL when no record
 * in the tree has an ske_obj equal to 'obj'.
 */
static spl_kmem_emergency_t *
spl_emergency_search(struct rb_root *root, void *obj)
{
	const unsigned long key = (unsigned long)obj;
	struct rb_node *cur;

	for (cur = root->rb_node; cur != NULL; ) {
		spl_kmem_emergency_t *entry =
		    container_of(cur, spl_kmem_emergency_t, ske_node);
		const unsigned long entry_key = (unsigned long)entry->ske_obj;

		/* Exact address match, this is the record we want. */
		if (key == entry_key)
			return entry;

		/* Otherwise descend toward where the key must reside. */
		cur = (key < entry_key) ? cur->rb_left : cur->rb_right;
	}

	return NULL;
}
/*
 * Link the emergency object record 'ske' into the tree rooted at 'root'.
 *
 * Records are ordered by their ske_obj address compared as an unsigned
 * long.  The tree is walked to locate the empty link where the new
 * node belongs, after which it is attached and the tree rebalanced.
 *
 * Returns 1 when the record was inserted, or 0 when a record with the
 * same ske_obj address is already present (the tree is left untouched).
 */
static int
spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
{
	const unsigned long key = (unsigned long)ske->ske_obj;
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*link != NULL) {
		spl_kmem_emergency_t *entry =
		    container_of(*link, spl_kmem_emergency_t, ske_node);
		const unsigned long entry_key = (unsigned long)entry->ske_obj;

		/* Duplicate address, refuse to insert a second record. */
		if (key == entry_key)
			return 0;

		parent = *link;
		link = (key < entry_key) ? &parent->rb_left : &parent->rb_right;
	}

	/* Attach at the located leaf position, then recolor/rebalance. */
	rb_link_node(&ske->ske_node, parent, link);
	rb_insert_color(&ske->ske_node, root);

	return 1;
}
/*
* Allocate a single emergency object for use by the caller.
* Allocate a single emergency object and track it in a red black tree.
*/
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
@ -1143,48 +1189,49 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
SRETURN(-ENOMEM);
}
if (skc->skc_ctor)
skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
spin_lock(&skc->skc_lock);
empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
if (likely(empty)) {
skc->skc_obj_total++;
skc->skc_obj_emergency++;
if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
skc->skc_obj_emergency_max = skc->skc_obj_emergency;
list_add(&ske->ske_list, &skc->skc_emergency_list);
}
spin_unlock(&skc->skc_lock);
if (unlikely(!empty)) {
kfree(ske->ske_obj);
kfree(ske);
SRETURN(-EINVAL);
}
if (skc->skc_ctor)
skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);
*obj = ske->ske_obj;
SRETURN(0);
}
/*
* Free the passed object if it is an emergency object or a normal slab
* object. Currently this is done by walking what should be a short list of
* emergency objects. If this proves to be too inefficient we can replace
* the simple list with a hash.
* Locate the passed object in the red black tree and free it.
*/
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
spl_kmem_emergency_t *m, *n, *ske = NULL;
spl_kmem_emergency_t *ske;
SENTRY;
spin_lock(&skc->skc_lock);
list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
if (m->ske_obj == obj) {
list_del(&m->ske_list);
ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
if (likely(ske)) {
rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
skc->skc_obj_emergency--;
skc->skc_obj_total--;
ske = m;
break;
}
}
spin_unlock(&skc->skc_lock);
if (ske == NULL)
if (unlikely(ske == NULL))
SRETURN(-ENOENT);
if (skc->skc_dtor)
@ -1483,7 +1530,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
INIT_LIST_HEAD(&skc->skc_list);
INIT_LIST_HEAD(&skc->skc_complete_list);
INIT_LIST_HEAD(&skc->skc_partial_list);
INIT_LIST_HEAD(&skc->skc_emergency_list);
skc->skc_emergency_tree = RB_ROOT;
spin_lock_init(&skc->skc_lock);
init_waitqueue_head(&skc->skc_waitq);
skc->skc_slab_fail = 0;
@ -1495,6 +1542,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_total = 0;
skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0;
skc->skc_obj_deadlock = 0;
skc->skc_obj_emergency = 0;
skc->skc_obj_emergency_max = 0;
@ -1589,7 +1637,6 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
ASSERT3U(skc->skc_obj_total, ==, 0);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
ASSERT(list_empty(&skc->skc_emergency_list));
kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
@ -1662,6 +1709,7 @@ spl_cache_grow_work(void *data)
atomic_dec(&skc->skc_ref);
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
spin_unlock(&skc->skc_lock);
@ -1677,13 +1725,20 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc)
return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
}
/*
 * Wait action passed to wait_on_bit() while a cache reap is in progress.
 * It simply yields the processor via schedule() and reports success (0).
 * The 'word' argument is not used.
 */
static int
spl_cache_reclaim_wait(void *word)
{
	schedule();

	return 0;
}
/*
* No available objects on any slabs, create a new slab.
*/
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
int remaining, rc = 0;
int remaining, rc;
SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC);
@ -1691,12 +1746,14 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
*obj = NULL;
/*
* Before allocating a new slab check if the slab is being reaped.
* If it is there is a good chance we can wait until it finishes
* and then use one of the newly freed but not aged-out slabs.
* Before allocating a new slab wait for any reaping to complete and
* then return so the local magazine can be rechecked for new objects.
*/
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
SRETURN(-EAGAIN);
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
SRETURN(rc ? rc : -EAGAIN);
}
/*
* This is handled by dispatching a work request to the global work
@ -1722,17 +1779,30 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
}
/*
* Allow a single timer tick before falling back to synchronously
* allocating the minimum about of memory required by the caller.
* The goal here is to only detect the rare case where a virtual slab
* allocation has deadlocked. We must be careful to minimize the use
* of emergency objects which are more expensive to track. Therefore,
* we set a very long timeout for the asynchronous allocation and if
* the timeout is reached the cache is flagged as deadlocked. From
* this point only new emergency objects will be allocated until the
* asynchronous allocation completes and clears the deadlocked flag.
*/
remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), 1);
if (remaining == 0) {
if (test_bit(KMC_BIT_NOEMERGENCY, &skc->skc_flags))
rc = -ENOMEM;
else
if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
rc = spl_emergency_alloc(skc, flags, obj);
} else {
remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), HZ);
if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
spin_lock(&skc->skc_lock);
if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
skc->skc_obj_deadlock++;
}
spin_unlock(&skc->skc_lock);
}
rc = -ENOMEM;
}
SRETURN(rc);
@ -1962,11 +2032,12 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
atomic_inc(&skc->skc_ref);
/*
* Emergency objects are never part of the virtual address space
* so if we get a virtual address we can optimize this check out.
* Only virtual slabs may have emergency objects and these objects
* are guaranteed to have physical addresses. They must be removed
* from the tree of emergency objects and then freed.
*/
if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
SGOTO(out, 0);
if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
SGOTO(out, spl_emergency_free(skc, obj));
local_irq_save(flags);
@ -2094,6 +2165,9 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
/* Reclaim from the cache, ignoring its age and delay. */
spl_slab_reclaim(skc, count, 1);
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
smp_mb__after_clear_bit();
wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
atomic_dec(&skc->skc_ref);
SEXIT;

View File

@ -625,12 +625,14 @@ slab_seq_show_headers(struct seq_file *f)
"--------------------- cache ----------"
"--------------------------------------------- "
"----- slab ------ "
"---- object -----------------\n");
"---- object ----- "
"--- emergency ---\n");
seq_printf(f,
"name "
" flags size alloc slabsize objsize "
"total alloc max "
"total alloc max emerg max\n");
"total alloc max "
"dlock alloc max\n");
}
static int
@ -643,7 +645,7 @@ slab_seq_show(struct seq_file *f, void *p)
spin_lock(&skc->skc_lock);
seq_printf(f, "%-36s ", skc->skc_name);
seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
"%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
"%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
(long unsigned)skc->skc_flags,
(long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
(long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@ -655,6 +657,7 @@ slab_seq_show(struct seq_file *f, void *p)
(long unsigned)skc->skc_obj_total,
(long unsigned)skc->skc_obj_alloc,
(long unsigned)skc->skc_obj_max,
(long unsigned)skc->skc_obj_deadlock,
(long unsigned)skc->skc_obj_emergency,
(long unsigned)skc->skc_obj_emergency_max);