Emergency slab objects

This patch is designed to resolve a deadlock which can occur with
__vmalloc() based slabs.  The issue is that the Linux kernel does
not honor the flags passed to __vmalloc().  This makes it unsafe
to use in a writeback context.  Unfortunately, this is a use case
ZFS depends on for correct operation.

Fixing this issue in the upstream kernel was pursued and patches
are available which resolve the issue.

  https://bugs.gentoo.org/show_bug.cgi?id=416685

However, these changes were rejected because upstream felt that
using __vmalloc() in the context of writeback should never be done.
Their solution was for us to rewrite parts of ZFS to accommodate
the Linux VM.

While that is probably the right long term solution, and it is
something we want to pursue, it is not a trivial task and will
likely destabilize the existing code.  This work has been planned
for the 0.7.0 release but in the meantime we want to improve the
SPL slab implementation to accommodate this expected ZFS usage.

This is accomplished by performing the __vmalloc() asynchronously
in the context of a work queue.  This doesn't eliminate the possibility
of the worker thread deadlocking.  However, the caller can now
safely block on a wait queue for the slab allocation to complete.

Normally this will occur in a reasonable amount of time and the
caller will be woken up when the new slab is available.  The objects
will then get cached in the per-cpu magazines and everything will
proceed as usual.

However, if the __vmalloc() deadlocks for the reasons described
above, or is just very slow, then the callers on the wait queues
will time out.  When this rare situation occurs they will attempt
to kmalloc() a single minimally sized object using the GFP_NOIO flags.
This allocation will not deadlock because kmalloc() will honor the
passed flags and the caller will be able to make forward progress.

As long as forward progress can be maintained then even if the
worker thread is deadlocked the critical thread will make progress.
This will eventually allow the deadlocked worker thread to complete
and normal operation will resume.

These emergency allocations will likely be slow since they require
contiguous pages.  However, their use should be rare so the impact
is expected to be minimal.  If that turns out not to be the case in
practice further optimizations are possible.

One additional concern is if these emergency objects are long lived.
Right now they are simply tracked on a list which must be walked when
an object is freed.  If they accumulate on a system and the list
grows, freeing objects will become more expensive.  This could be
handled relatively easily by using a hash instead of a list, but that
optimization (if needed) is left for a follow up patch.

Additionally, these emergency objects could be repacked into existing
slabs as objects are freed if the kmem_cache_set_move() functionality
was implemented.  See issue https://github.com/zfsonlinux/spl/issues/26
for full details.  This work would also help reduce ZFS's memory
fragmentation problems.

The /proc/spl/kmem/slab file has had two new columns added at the
end.  The 'emerg' column reports the current number of these emergency
objects in use for the cache, and the following 'max' column shows
the historical worst case.  These values should give us a good idea
of how often these objects are needed.  Based on these values under
real use cases we can tune the default behavior.

Lastly, as a side benefit, using a single work queue for the slab
allocations should reduce cpu contention on the global virtual address
space lock.  This should manifest itself as reduced cpu usage for
the system.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Brian Behlendorf 2012-08-07 16:59:50 -07:00
parent 587045a638
commit e2dcc6e2b8
3 changed files with 216 additions and 44 deletions

View File

@ -291,6 +291,7 @@ enum {
KMC_BIT_KMEM = 5, /* Use kmem cache */ KMC_BIT_KMEM = 5, /* Use kmem cache */
KMC_BIT_VMEM = 6, /* Use vmem cache */ KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */ KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */ KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */ KMC_BIT_DESTROY = 17, /* Destroy in progress */
KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
@ -315,6 +316,7 @@ typedef enum kmem_cbrc {
#define KMC_KMEM (1 << KMC_BIT_KMEM) #define KMC_KMEM (1 << KMC_BIT_KMEM)
#define KMC_VMEM (1 << KMC_BIT_VMEM) #define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) #define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_REAPING (1 << KMC_BIT_REAPING) #define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY) #define KMC_DESTROY (1 << KMC_BIT_DESTROY)
#define KMC_TOTAL (1 << KMC_BIT_TOTAL) #define KMC_TOTAL (1 << KMC_BIT_TOTAL)
@ -374,6 +376,17 @@ typedef struct spl_kmem_slab {
uint32_t sks_ref; /* Ref count used objects */ uint32_t sks_ref; /* Ref count used objects */
} spl_kmem_slab_t; } spl_kmem_slab_t;
/*
 * Describes one asynchronous slab allocation request.  An instance is
 * allocated with kmalloc() by spl_cache_grow(), handed to
 * spl_cache_grow_work() through the embedded delayed work item, and
 * released with kfree() by that work function once it has run.
 */
typedef struct spl_kmem_alloc {
struct spl_kmem_cache *ska_cache; /* Owned by cache */
int ska_flags; /* Allocation flags */
struct delayed_work ska_work; /* Allocation work */
} spl_kmem_alloc_t;
/*
 * Tracking record for a single emergency object.  The record and the
 * buffer it points at are two separate kmalloc() allocations; the record
 * is linked on the owning cache's skc_emergency_list so that the buffer
 * can be recognised again when it is freed.
 */
typedef struct spl_kmem_emergency {
void *ske_obj; /* Buffer address */
struct list_head ske_list; /* Emergency list linkage */
} spl_kmem_emergency_t;
typedef struct spl_kmem_cache { typedef struct spl_kmem_cache {
uint32_t skc_magic; /* Sanity magic */ uint32_t skc_magic; /* Sanity magic */
uint32_t skc_name_size; /* Name length */ uint32_t skc_name_size; /* Name length */
@ -398,7 +411,9 @@ typedef struct spl_kmem_cache {
struct list_head skc_list; /* List of caches linkage */ struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */ struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */ struct list_head skc_partial_list; /* Partially alloc'ed */
struct list_head skc_emergency_list; /* Min sized objects */
spinlock_t skc_lock; /* Cache lock */ spinlock_t skc_lock; /* Cache lock */
wait_queue_head_t skc_waitq; /* Allocation waiters */
uint64_t skc_slab_fail; /* Slab alloc failures */ uint64_t skc_slab_fail; /* Slab alloc failures */
uint64_t skc_slab_create;/* Slab creates */ uint64_t skc_slab_create;/* Slab creates */
uint64_t skc_slab_destroy;/* Slab destroys */ uint64_t skc_slab_destroy;/* Slab destroys */
@ -408,6 +423,8 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */ uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */ uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */ uint64_t skc_obj_max; /* Obj max historic */
uint64_t skc_obj_emergency; /* Obj emergency current */
uint64_t skc_obj_emergency_max; /* Obj emergency max */
} spl_kmem_cache_t; } spl_kmem_cache_t;
#define kmem_cache_t spl_kmem_cache_t #define kmem_cache_t spl_kmem_cache_t

View File

@ -1143,6 +1143,86 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
SEXIT; SEXIT;
} }
/*
 * Provide the caller with one emergency object allocated directly with
 * kmalloc() instead of being carved out of a slab.
 *
 * skc   - cache the object is accounted against
 * flags - allocation flags, passed unchanged to kmalloc() and the ctor
 * obj   - on success receives the address of the new object
 *
 * Returns 0 on success, -EEXIST when a partial slab is already available
 * (no emergency object is created in that case), or -ENOMEM when either
 * the tracking record or the object buffer could not be allocated.
 */
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
{
	spl_kmem_emergency_t *entry;
	int have_partial;
	SENTRY;

	/* A partial slab may have appeared while we waited; prefer it. */
	spin_lock(&skc->skc_lock);
	have_partial = !list_empty(&skc->skc_partial_list);
	spin_unlock(&skc->skc_lock);

	if (have_partial)
		SRETURN(-EEXIST);

	/* Tracking record first, then the object buffer itself. */
	entry = kmalloc(sizeof(*entry), flags);
	if (!entry)
		SRETURN(-ENOMEM);

	entry->ske_obj = kmalloc(skc->skc_obj_size, flags);
	if (!entry->ske_obj) {
		kfree(entry);
		SRETURN(-ENOMEM);
	}

	/* Construct the object outside of the cache lock. */
	if (skc->skc_ctor != NULL)
		skc->skc_ctor(entry->ske_obj, skc->skc_private, flags);

	/* Publish the object and update the statistics under the lock. */
	spin_lock(&skc->skc_lock);
	skc->skc_obj_total++;
	skc->skc_obj_emergency++;
	if (skc->skc_obj_emergency_max < skc->skc_obj_emergency)
		skc->skc_obj_emergency_max = skc->skc_obj_emergency;
	list_add(&entry->ske_list, &skc->skc_emergency_list);
	spin_unlock(&skc->skc_lock);

	*obj = entry->ske_obj;

	SRETURN(0);
}
/*
 * Free the passed object if, and only if, it is an emergency object of
 * this cache.  The lookup walks skc_emergency_list, which is expected to
 * be short; should that prove too slow the list can be replaced by a hash.
 *
 * skc - cache the object was allocated from
 * obj - address of the object being released
 *
 * Returns 0 when obj was an emergency object and has been destroyed and
 * freed, or -ENOENT when obj is not on the emergency list, in which case
 * nothing is modified.
 */
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
	spl_kmem_emergency_t *cur, *next, *found = NULL;
	SENTRY;

	/* Unlink the matching record, if any, while holding the lock. */
	spin_lock(&skc->skc_lock);
	list_for_each_entry_safe(cur, next, &skc->skc_emergency_list, ske_list) {
		if (cur->ske_obj != obj)
			continue;

		list_del(&cur->ske_list);
		skc->skc_obj_emergency--;
		skc->skc_obj_total--;
		found = cur;
		break;
	}
	spin_unlock(&skc->skc_lock);

	if (!found)
		SRETURN(-ENOENT);

	/* Destructor and frees run after the lock has been dropped. */
	if (skc->skc_dtor != NULL)
		skc->skc_dtor(found->ske_obj, skc->skc_private);

	kfree(found->ske_obj);
	kfree(found);

	SRETURN(0);
}
/* /*
* Called regularly on all caches to age objects out of the magazines * Called regularly on all caches to age objects out of the magazines
* which have not been access in skc->skc_delay seconds. This prevents * which have not been access in skc->skc_delay seconds. This prevents
@ -1430,7 +1510,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
INIT_LIST_HEAD(&skc->skc_list); INIT_LIST_HEAD(&skc->skc_list);
INIT_LIST_HEAD(&skc->skc_complete_list); INIT_LIST_HEAD(&skc->skc_complete_list);
INIT_LIST_HEAD(&skc->skc_partial_list); INIT_LIST_HEAD(&skc->skc_partial_list);
INIT_LIST_HEAD(&skc->skc_emergency_list);
spin_lock_init(&skc->skc_lock); spin_lock_init(&skc->skc_lock);
init_waitqueue_head(&skc->skc_waitq);
skc->skc_slab_fail = 0; skc->skc_slab_fail = 0;
skc->skc_slab_create = 0; skc->skc_slab_create = 0;
skc->skc_slab_destroy = 0; skc->skc_slab_destroy = 0;
@ -1440,6 +1522,8 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_total = 0; skc->skc_obj_total = 0;
skc->skc_obj_alloc = 0; skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0; skc->skc_obj_max = 0;
skc->skc_obj_emergency = 0;
skc->skc_obj_emergency_max = 0;
if (align) { if (align) {
VERIFY(ISP2(align)); VERIFY(ISP2(align));
@ -1530,7 +1614,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
ASSERT3U(skc->skc_obj_alloc, ==, 0); ASSERT3U(skc->skc_obj_alloc, ==, 0);
ASSERT3U(skc->skc_slab_total, ==, 0); ASSERT3U(skc->skc_slab_total, ==, 0);
ASSERT3U(skc->skc_obj_total, ==, 0); ASSERT3U(skc->skc_obj_total, ==, 0);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list)); ASSERT(list_empty(&skc->skc_complete_list));
ASSERT(list_empty(&skc->skc_emergency_list));
kmem_free(skc->skc_name, skc->skc_name_size); kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock); spin_unlock(&skc->skc_lock);
@ -1581,59 +1667,112 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
} }
/* /*
* No available objects on any slabs, create a new slab. Since this * Generic slab allocation function to run by the global work queues.
* is an expensive operation we do it without holding the spin lock and * It is responsible for allocating a new slab, linking it in to the list
* only briefly acquire it when we link in the fully allocated and * of partial slabs, and then waking any waiters.
* constructed slab.
*/ */
static spl_kmem_slab_t * static void
spl_cache_grow(spl_kmem_cache_t *skc, int flags) spl_cache_grow_work(void *data)
{ {
spl_kmem_alloc_t *ska =
spl_get_work_data(data, spl_kmem_alloc_t, ska_work.work);
spl_kmem_cache_t *skc = ska->ska_cache;
spl_kmem_slab_t *sks; spl_kmem_slab_t *sks;
sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
spin_lock(&skc->skc_lock);
if (sks) {
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
}
atomic_dec(&skc->skc_ref);
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
spin_unlock(&skc->skc_lock);
kfree(ska);
}
/*
 * Wait condition for callers sleeping on skc_waitq: evaluates to non-zero
 * once KMC_BIT_GROWING is no longer set in the cache flags, and to zero
 * while the bit is still set.
 */
static int
spl_cache_grow_wait(spl_kmem_cache_t *skc)
{
	int growing = test_bit(KMC_BIT_GROWING, &skc->skc_flags);

	return growing == 0;
}
/*
* No available objects on any slabs, create a new slab.
*/
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
int remaining, rc = 0;
SENTRY; SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_magic == SKC_MAGIC);
local_irq_enable();
might_sleep(); might_sleep();
*obj = NULL;
/* /*
* Before allocating a new slab check if the slab is being reaped. * Before allocating a new slab check if the slab is being reaped.
* If it is there is a good chance we can wait until it finishes * If it is there is a good chance we can wait until it finishes
* and then use one of the newly freed but not aged-out slabs. * and then use one of the newly freed but not aged-out slabs.
*/ */
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
schedule(); SRETURN(-EAGAIN);
SGOTO(out, sks= NULL);
/*
* This is handled by dispatching a work request to the global work
* queue. This allows us to asynchronously allocate a new slab while
* retaining the ability to safely fall back to a smaller synchronous
* allocations to ensure forward progress is always maintained.
*/
if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
spl_kmem_alloc_t *ska;
ska = kmalloc(sizeof(*ska), flags);
if (ska == NULL) {
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
SRETURN(-ENOMEM);
} }
/* Allocate a new slab for the cache */ atomic_inc(&skc->skc_ref);
sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG); ska->ska_cache = skc;
if (sks == NULL) ska->ska_flags = flags;
SGOTO(out, sks = NULL); spl_init_delayed_work(&ska->ska_work, spl_cache_grow_work, ska);
schedule_delayed_work(&ska->ska_work, 0);
/* Link the new empty slab in to the end of skc_partial_list. */
spin_lock(&skc->skc_lock);
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
spin_unlock(&skc->skc_lock);
out:
local_irq_disable();
SRETURN(sks);
} }
/* /*
* Refill a per-cpu magazine with objects from the slabs for this * Allow a single timer tick before falling back to synchronously
* cache. Ideally the magazine can be repopulated using existing * allocating the minimum amount of memory required by the caller.
* objects which have been released, however if we are unable to
* locate enough free objects new slabs of objects will be created.
*/ */
static int remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), 1);
if (remaining == 0)
rc = spl_emergency_alloc(skc, flags, obj);
SRETURN(rc);
}
/*
* Refill a per-cpu magazine with objects from the slabs for this cache.
* Ideally the magazine can be repopulated using existing objects which have
* been released, however if we are unable to locate enough free objects new
* slabs of objects will be created. On success NULL is returned, otherwise
* the address of a single emergency object is returned for use by the caller.
*/
static void *
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{ {
spl_kmem_slab_t *sks; spl_kmem_slab_t *sks;
int rc = 0, refill; int count = 0, rc, refill;
void *obj = NULL;
SENTRY; SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_magic == SKC_MAGIC);
@ -1647,8 +1786,15 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
if (list_empty(&skc->skc_partial_list)) { if (list_empty(&skc->skc_partial_list)) {
spin_unlock(&skc->skc_lock); spin_unlock(&skc->skc_lock);
sks = spl_cache_grow(skc, flags); local_irq_enable();
if (!sks) rc = spl_cache_grow(skc, flags, &obj);
local_irq_disable();
/* Emergency object for immediate use by caller */
if (rc == 0 && obj != NULL)
SRETURN(obj);
if (rc)
SGOTO(out, rc); SGOTO(out, rc);
/* Rescheduled to different CPU skm is not local */ /* Rescheduled to different CPU skm is not local */
@ -1673,9 +1819,9 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
/* Consume as many objects as needed to refill the requested /* Consume as many objects as needed to refill the requested
* cache. We must also be careful not to overfill it. */ * cache. We must also be careful not to overfill it. */
while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) { while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
ASSERT(skm->skm_avail < skm->skm_size); ASSERT(skm->skm_avail < skm->skm_size);
ASSERT(rc < skm->skm_size); ASSERT(count < skm->skm_size);
skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks); skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
} }
@ -1688,8 +1834,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
spin_unlock(&skc->skc_lock); spin_unlock(&skc->skc_lock);
out: out:
/* Returns the number of entries added to cache */ SRETURN(NULL);
SRETURN(rc);
} }
/* /*
@ -1804,9 +1949,8 @@ restart:
obj = skm->skm_objs[--skm->skm_avail]; obj = skm->skm_objs[--skm->skm_avail];
skm->skm_age = jiffies; skm->skm_age = jiffies;
} else { } else {
/* Per-CPU cache empty, directly allocate from obj = spl_cache_refill(skc, skm, flags);
* the slab and refill the per-CPU cache. */ if (obj == NULL)
(void)spl_cache_refill(skc, skm, flags);
SGOTO(restart, obj = NULL); SGOTO(restart, obj = NULL);
} }
@ -1838,6 +1982,14 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
atomic_inc(&skc->skc_ref); atomic_inc(&skc->skc_ref);
/*
* Emergency objects are never part of the virtual address space
* so if we get a virtual address we can optimize this check out.
*/
if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
SGOTO(out, 0);
local_irq_save(flags); local_irq_save(flags);
/* Safe to update per-cpu structure without lock, but /* Safe to update per-cpu structure without lock, but
@ -1855,6 +2007,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
skm->skm_objs[skm->skm_avail++] = obj; skm->skm_objs[skm->skm_avail++] = obj;
local_irq_restore(flags); local_irq_restore(flags);
out:
atomic_dec(&skc->skc_ref); atomic_dec(&skc->skc_ref);
SEXIT; SEXIT;

View File

@ -625,12 +625,12 @@ slab_seq_show_headers(struct seq_file *f)
"--------------------- cache ----------" "--------------------- cache ----------"
"--------------------------------------------- " "--------------------------------------------- "
"----- slab ------ " "----- slab ------ "
"---- object -----\n"); "---- object -----------------\n");
seq_printf(f, seq_printf(f,
"name " "name "
" flags size alloc slabsize objsize " " flags size alloc slabsize objsize "
"total alloc max " "total alloc max "
"total alloc max\n"); "total alloc max emerg max\n");
} }
static int static int
@ -643,7 +643,7 @@ slab_seq_show(struct seq_file *f, void *p)
spin_lock(&skc->skc_lock); spin_lock(&skc->skc_lock);
seq_printf(f, "%-36s ", skc->skc_name); seq_printf(f, "%-36s ", skc->skc_name);
seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
"%5lu %5lu %5lu %5lu %5lu %5lu\n", "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
(long unsigned)skc->skc_flags, (long unsigned)skc->skc_flags,
(long unsigned)(skc->skc_slab_size * skc->skc_slab_total), (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
(long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
@ -654,7 +654,9 @@ slab_seq_show(struct seq_file *f, void *p)
(long unsigned)skc->skc_slab_max, (long unsigned)skc->skc_slab_max,
(long unsigned)skc->skc_obj_total, (long unsigned)skc->skc_obj_total,
(long unsigned)skc->skc_obj_alloc, (long unsigned)skc->skc_obj_alloc,
(long unsigned)skc->skc_obj_max); (long unsigned)skc->skc_obj_max,
(long unsigned)skc->skc_obj_emergency,
(long unsigned)skc->skc_obj_emergency_max);
spin_unlock(&skc->skc_lock); spin_unlock(&skc->skc_lock);