kmem-cache: Use taskqs for ageing

Shift the cache and magazine ageing functionality over to the new
delayed taskq interfaces.  This allows us to abandon the kernel's
delayed work queue interface and all the compatibility code it
requires.

However, the delayed taskq interface does not allow us to schedule
a task for a specific cpu, so the ageing code was slightly reworked.
The magazine ageing delay has been directly linked to the cache
ageing function.  The spl_cache_age() function invokes on_each_cpu()
in order to run spl_magazine_age() on each cpu.  It then blocks
waiting for them to complete and promptly reclaims any free slabs.

While restructuring the code wasn't the primary goal, I think the
new code is far more understandable and maintainable.  It also should
help minimize magazine thrashing because free slabs are immediately
released after the magazine is aged.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Brian Behlendorf 2012-12-10 10:53:46 -08:00
parent 296a8e596d
commit a10287e00d
2 changed files with 52 additions and 43 deletions

View File

@ -37,6 +37,7 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/vmsystm.h> #include <sys/vmsystm.h>
#include <sys/kstat.h> #include <sys/kstat.h>
#include <sys/taskq.h>
/* /*
* Memory allocation interfaces * Memory allocation interfaces
@ -406,7 +407,6 @@ typedef struct spl_kmem_magazine {
uint32_t skm_size; /* Magazine size */ uint32_t skm_size; /* Magazine size */
uint32_t skm_refill; /* Batch refill size */ uint32_t skm_refill; /* Batch refill size */
struct spl_kmem_cache *skm_cache; /* Owned by cache */ struct spl_kmem_cache *skm_cache; /* Owned by cache */
struct delayed_work skm_work; /* Magazine reclaim work */
unsigned long skm_age; /* Last cache access */ unsigned long skm_age; /* Last cache access */
unsigned int skm_cpu; /* Owned by cpu */ unsigned int skm_cpu; /* Owned by cpu */
void *skm_objs[0]; /* Object pointers */ void *skm_objs[0]; /* Object pointers */
@ -460,7 +460,7 @@ typedef struct spl_kmem_cache {
uint32_t skc_delay; /* Slab reclaim interval */ uint32_t skc_delay; /* Slab reclaim interval */
uint32_t skc_reap; /* Slab reclaim count */ uint32_t skc_reap; /* Slab reclaim count */
atomic_t skc_ref; /* Ref count callers */ atomic_t skc_ref; /* Ref count callers */
struct delayed_work skc_work; /* Slab reclaim work */ taskqid_t skc_taskqid; /* Slab reclaim task */
struct list_head skc_list; /* List of caches linkage */ struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */ struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */ struct list_head skc_partial_list; /* Partially alloc'ed */

View File

@ -825,6 +825,7 @@ EXPORT_SYMBOL(vmem_free_debug);
struct list_head spl_kmem_cache_list; /* List of caches */ struct list_head spl_kmem_cache_list; /* List of caches */
struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
static int spl_cache_flush(spl_kmem_cache_t *skc, static int spl_cache_flush(spl_kmem_cache_t *skc,
spl_kmem_magazine_t *skm, int flush); spl_kmem_magazine_t *skm, int flush);
@ -1243,50 +1244,59 @@ spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
SRETURN(0); SRETURN(0);
} }
/*
* Called regularly on all caches to age objects out of the magazines
* which have not been access in skc->skc_delay seconds. This prevents
* idle magazines from holding memory which might be better used by
* other caches or parts of the system. The delay is present to
* prevent thrashing the magazine.
*/
static void static void
spl_magazine_age(void *data) spl_magazine_age(void *data)
{ {
spl_kmem_magazine_t *skm = spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
spl_get_work_data(data, spl_kmem_magazine_t, skm_work.work); spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
spl_kmem_cache_t *skc = skm->skm_cache;
ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC);
ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_cpu == smp_processor_id());
ASSERT(skc->skc_mag[skm->skm_cpu] == skm);
if (skm->skm_avail > 0 && if (skm->skm_avail > 0)
time_after(jiffies, skm->skm_age + skc->skc_delay * HZ)) if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
(void)spl_cache_flush(skc, skm, skm->skm_refill); (void) spl_cache_flush(skc, skm, skm->skm_refill);
if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
schedule_delayed_work_on(skm->skm_cpu, &skm->skm_work,
skc->skc_delay / 3 * HZ);
} }
/* /*
* Called regularly to keep a downward pressure on the size of idle * Called regularly to keep a downward pressure on the cache.
* magazines and to release free slabs from the cache. This function *
* never calls the registered reclaim function, that only occurs * Objects older than skc->skc_delay seconds in the per-cpu magazines will
* under memory pressure or with a direct call to spl_kmem_reap(). * be returned to the caches. This is done to prevent idle magazines from
* holding memory which could be better used elsewhere. The delay is
* present to prevent thrashing the magazine.
*
* The newly released objects may result in empty partial slabs. Those
* slabs should be released to the system. Otherwise moving the objects
* out of the magazines is just wasted work.
*/ */
static void static void
spl_cache_age(void *data) spl_cache_age(void *data)
{ {
spl_kmem_cache_t *skc = spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
spl_get_work_data(data, spl_kmem_cache_t, skc_work.work); taskqid_t id = 0;
ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_magic == SKC_MAGIC);
atomic_inc(&skc->skc_ref);
spl_on_each_cpu(spl_magazine_age, skc, 1);
spl_slab_reclaim(skc, skc->skc_reap, 0); spl_slab_reclaim(skc, skc->skc_reap, 0);
if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)) while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ); id = taskq_dispatch_delay(
spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
/* Destroy issued after dispatch immediately cancel it */
if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
taskq_cancel_id(spl_kmem_cache_taskq, id);
}
spin_lock(&skc->skc_lock);
skc->skc_taskqid = id;
spin_unlock(&skc->skc_lock);
atomic_dec(&skc->skc_ref);
} }
/* /*
@ -1380,7 +1390,6 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
skm->skm_size = skc->skc_mag_size; skm->skm_size = skc->skc_mag_size;
skm->skm_refill = skc->skc_mag_refill; skm->skm_refill = skc->skc_mag_refill;
skm->skm_cache = skc; skm->skm_cache = skc;
spl_init_delayed_work(&skm->skm_work, spl_magazine_age, skm);
skm->skm_age = jiffies; skm->skm_age = jiffies;
skm->skm_cpu = cpu; skm->skm_cpu = cpu;
} }
@ -1427,11 +1436,6 @@ spl_magazine_create(spl_kmem_cache_t *skc)
} }
} }
/* Only after everything is allocated schedule magazine work */
for_each_online_cpu(i)
schedule_delayed_work_on(i, &skc->skc_mag[i]->skm_work,
skc->skc_delay / 3 * HZ);
SRETURN(0); SRETURN(0);
} }
@ -1566,8 +1570,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
if (rc) if (rc)
SGOTO(out, rc); SGOTO(out, rc);
spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc); skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ); spl_cache_age, skc, TQ_SLEEP,
ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
down_write(&spl_kmem_cache_sem); down_write(&spl_kmem_cache_sem);
list_add_tail(&skc->skc_list, &spl_kmem_cache_list); list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
@ -1600,7 +1605,7 @@ void
spl_kmem_cache_destroy(spl_kmem_cache_t *skc) spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
{ {
DECLARE_WAIT_QUEUE_HEAD(wq); DECLARE_WAIT_QUEUE_HEAD(wq);
int i; taskqid_t id;
SENTRY; SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_magic == SKC_MAGIC);
@ -1609,13 +1614,14 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
list_del_init(&skc->skc_list); list_del_init(&skc->skc_list);
up_write(&spl_kmem_cache_sem); up_write(&spl_kmem_cache_sem);
/* Cancel any and wait for any pending delayed work */ /* Cancel any and wait for any pending delayed tasks */
VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
cancel_delayed_work_sync(&skc->skc_work);
for_each_online_cpu(i)
cancel_delayed_work_sync(&skc->skc_mag[i]->skm_work);
flush_scheduled_work(); spin_lock(&skc->skc_lock);
id = skc->skc_taskqid;
spin_unlock(&skc->skc_lock);
taskq_cancel_id(spl_kmem_cache_taskq, id);
/* Wait until all current callers complete, this is mainly /* Wait until all current callers complete, this is mainly
* to catch the case where a low memory situation triggers a * to catch the case where a low memory situation triggers a
@ -2394,6 +2400,8 @@ spl_kmem_init(void)
init_rwsem(&spl_kmem_cache_sem); init_rwsem(&spl_kmem_cache_sem);
INIT_LIST_HEAD(&spl_kmem_cache_list); INIT_LIST_HEAD(&spl_kmem_cache_list);
spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE);
spl_register_shrinker(&spl_kmem_cache_shrinker); spl_register_shrinker(&spl_kmem_cache_shrinker);
@ -2432,6 +2440,7 @@ spl_kmem_fini(void)
SENTRY; SENTRY;
spl_unregister_shrinker(&spl_kmem_cache_shrinker); spl_unregister_shrinker(&spl_kmem_cache_shrinker);
taskq_destroy(spl_kmem_cache_taskq);
SEXIT; SEXIT;
} }