mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-26 03:09:34 +03:00
Victory! I've reworked caches with large objects which are
backed by vmalloc()'ed memory. I now allocate a slab which is roughly 32*spl_obj_size, and in this block of memory I place the slab descriptor, the slab object descriptors, and the objects themselves. This greatly reduces vmalloc lock contention. Some minor cleanup and fine-tuning still remain, but it's working pretty well. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@139 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c
This commit is contained in:
parent
ff449ac406
commit
fece7c99bf
@ -485,7 +485,6 @@ typedef struct spl_kmem_magazine {
|
||||
|
||||
typedef struct spl_kmem_obj {
|
||||
uint32_t sko_magic; /* Sanity magic */
|
||||
uint32_t sko_flags; /* Per object flags */
|
||||
void *sko_addr; /* Buffer address */
|
||||
struct spl_kmem_slab *sko_slab; /* Owned by slab */
|
||||
struct list_head sko_list; /* Free object list linkage */
|
||||
|
@ -167,17 +167,9 @@ static struct shrinker spl_kmem_cache_shrinker = {
|
||||
};
|
||||
#endif
|
||||
|
||||
static spl_kmem_slab_t *
|
||||
spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
|
||||
spl_kmem_slab_t *sks;
|
||||
spl_kmem_obj_t *sko, *n;
|
||||
int i;
|
||||
ENTRY;
|
||||
|
||||
sks = kmem_cache_alloc(spl_slab_cache, flags);
|
||||
if (sks == NULL)
|
||||
RETURN(sks);
|
||||
|
||||
static void
|
||||
spl_slab_init(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
|
||||
{
|
||||
sks->sks_magic = SKS_MAGIC;
|
||||
sks->sks_objs = SPL_KMEM_CACHE_OBJ_PER_SLAB;
|
||||
sks->sks_age = jiffies;
|
||||
@ -185,91 +177,201 @@ spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
|
||||
INIT_LIST_HEAD(&sks->sks_list);
|
||||
INIT_LIST_HEAD(&sks->sks_free_list);
|
||||
sks->sks_ref = 0;
|
||||
}
|
||||
|
||||
static int
|
||||
spl_slab_alloc_kmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks, int flags)
|
||||
{
|
||||
spl_kmem_obj_t *sko, *n;
|
||||
int i, rc = 0;
|
||||
|
||||
/* This is based on the linux slab cache for now simply because
|
||||
* it means I get slab coloring, hardware cache alignment, etc
|
||||
* for free. There's no reason we can't do this ourselves. And
|
||||
* we probably should at in the future. For now I'll just
|
||||
* leverage the existing linux slab here. */
|
||||
for (i = 0; i < sks->sks_objs; i++) {
|
||||
sko = kmem_cache_alloc(spl_obj_cache, flags);
|
||||
if (sko == NULL) {
|
||||
out_alloc:
|
||||
/* Unable to fully construct slab, objects,
|
||||
* and object data buffers unwind everything.
|
||||
*/
|
||||
list_for_each_entry_safe(sko, n, &sks->sks_free_list,
|
||||
sko_list) {
|
||||
rc = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
sko->sko_addr = kmem_alloc(skc->skc_obj_size, flags);
|
||||
if (sko->sko_addr == NULL) {
|
||||
kmem_cache_free(spl_obj_cache, sko);
|
||||
rc = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
sko->sko_magic = SKO_MAGIC;
|
||||
sko->sko_slab = sks;
|
||||
INIT_LIST_HEAD(&sko->sko_list);
|
||||
INIT_HLIST_NODE(&sko->sko_hlist);
|
||||
list_add(&sko->sko_list, &sks->sks_free_list);
|
||||
}
|
||||
|
||||
/* Unable to fully construct slab, unwind everything */
|
||||
if (rc) {
|
||||
list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
|
||||
ASSERT(sko->sko_magic == SKO_MAGIC);
|
||||
vmem_free(sko->sko_addr, skc->skc_obj_size);
|
||||
kmem_free(sko->sko_addr, skc->skc_obj_size);
|
||||
list_del(&sko->sko_list);
|
||||
kmem_cache_free(spl_obj_cache, sko);
|
||||
}
|
||||
|
||||
kmem_cache_free(spl_slab_cache, sks);
|
||||
GOTO(out, sks = NULL);
|
||||
}
|
||||
|
||||
RETURN(rc);
|
||||
}
|
||||
|
||||
static spl_kmem_slab_t *
|
||||
spl_slab_alloc_vmem(spl_kmem_cache_t *skc, int flags)
|
||||
{
|
||||
spl_kmem_slab_t *sks;
|
||||
spl_kmem_obj_t *sko, *sko_base;
|
||||
void *slab, *obj, *obj_base;
|
||||
int i, size;
|
||||
|
||||
/* For large vmem_alloc'ed buffers it's important that we pack the
|
||||
* spl_kmem_obj_t structure and the actual objects in to one large
|
||||
* virtual address zone to minimize the number of calls to
|
||||
* vmalloc(). Mapping the virtual address in done under a single
|
||||
* global lock which walks a list of all virtual zones. So doing
|
||||
* lots of allocations simply results in lock contention and a
|
||||
* longer list of mapped addresses. It is far better to do a
|
||||
* few large allocations and then subdivide it ourselves. The
|
||||
* large vmem_alloc'ed space is divied as follows:
|
||||
*
|
||||
* 1 slab struct: sizeof(spl_kmem_slab_t)
|
||||
* N obj structs: sizeof(spl_kmem_obj_t) * skc->skc_objs
|
||||
* N objects: skc->skc_obj_size * skc->skc_objs
|
||||
*
|
||||
* XXX: It would probably be a good idea to more carefully
|
||||
* align the starts of these objects in memory.
|
||||
*/
|
||||
size = sizeof(spl_kmem_slab_t) + SPL_KMEM_CACHE_OBJ_PER_SLAB *
|
||||
(skc->skc_obj_size + sizeof(spl_kmem_obj_t));
|
||||
|
||||
slab = vmem_alloc(size, flags);
|
||||
if (slab == NULL)
|
||||
RETURN(NULL);
|
||||
|
||||
sks = (spl_kmem_slab_t *)slab;
|
||||
spl_slab_init(skc, sks);
|
||||
|
||||
sko_base = (spl_kmem_obj_t *)(slab + sizeof(spl_kmem_slab_t));
|
||||
obj_base = (void *)sko_base + sizeof(spl_kmem_obj_t) * sks->sks_objs;
|
||||
|
||||
for (i = 0; i < sks->sks_objs; i++) {
|
||||
sko = &sko_base[i];
|
||||
obj = obj_base + skc->skc_obj_size * i;
|
||||
sko->sko_addr = obj;
|
||||
sko->sko_magic = SKO_MAGIC;
|
||||
sko->sko_slab = sks;
|
||||
INIT_LIST_HEAD(&sko->sko_list);
|
||||
INIT_HLIST_NODE(&sko->sko_hlist);
|
||||
list_add_tail(&sko->sko_list, &sks->sks_free_list);
|
||||
}
|
||||
|
||||
RETURN(sks);
|
||||
}
|
||||
|
||||
static spl_kmem_slab_t *
|
||||
spl_slab_alloc(spl_kmem_cache_t *skc, int flags) {
|
||||
spl_kmem_slab_t *sks;
|
||||
spl_kmem_obj_t *sko;
|
||||
int rc;
|
||||
ENTRY;
|
||||
|
||||
/* Objects less than a page can use kmem_alloc() and avoid
|
||||
* the locking overhead in __get_vm_area_node() when locking
|
||||
* for a free address. For objects over a page we use
|
||||
* vmem_alloc() because it is usually worth paying this
|
||||
* overhead to avoid the need to find contigeous pages.
|
||||
* This should give us the best of both worlds. */
|
||||
if (skc->skc_obj_size <= PAGE_SIZE)
|
||||
sko->sko_addr = kmem_alloc(skc->skc_obj_size, flags);
|
||||
else
|
||||
sko->sko_addr = vmem_alloc(skc->skc_obj_size, flags);
|
||||
if (skc->skc_obj_size <= PAGE_SIZE) {
|
||||
sks = kmem_cache_alloc(spl_slab_cache, flags);
|
||||
if (sks == NULL)
|
||||
GOTO(out, sks = NULL);
|
||||
|
||||
if (sko->sko_addr == NULL) {
|
||||
kmem_cache_free(spl_obj_cache, sko);
|
||||
GOTO(out_alloc, sks = NULL);
|
||||
spl_slab_init(skc, sks);
|
||||
|
||||
rc = spl_slab_alloc_kmem(skc, sks, flags);
|
||||
if (rc) {
|
||||
kmem_cache_free(spl_slab_cache, sks);
|
||||
GOTO(out, sks = NULL);
|
||||
}
|
||||
} else {
|
||||
sks = spl_slab_alloc_vmem(skc, flags);
|
||||
if (sks == NULL)
|
||||
GOTO(out, sks = NULL);
|
||||
}
|
||||
|
||||
sko->sko_magic = SKO_MAGIC;
|
||||
sko->sko_flags = 0;
|
||||
sko->sko_slab = sks;
|
||||
INIT_LIST_HEAD(&sko->sko_list);
|
||||
INIT_HLIST_NODE(&sko->sko_hlist);
|
||||
list_add(&sko->sko_list, &sks->sks_free_list);
|
||||
}
|
||||
ASSERT(sks);
|
||||
list_for_each_entry(sko, &sks->sks_free_list, sko_list)
|
||||
if (skc->skc_ctor)
|
||||
skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
|
||||
out:
|
||||
RETURN(sks);
|
||||
}
|
||||
|
||||
static void
|
||||
spl_slab_free_kmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
|
||||
{
|
||||
spl_kmem_obj_t *sko, *n;
|
||||
|
||||
ASSERT(skc->skc_magic == SKC_MAGIC);
|
||||
ASSERT(sks->sks_magic == SKS_MAGIC);
|
||||
|
||||
list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
|
||||
ASSERT(sko->sko_magic == SKO_MAGIC);
|
||||
kmem_free(sko->sko_addr, skc->skc_obj_size);
|
||||
list_del(&sko->sko_list);
|
||||
kmem_cache_free(spl_obj_cache, sko);
|
||||
}
|
||||
|
||||
kmem_cache_free(spl_slab_cache, sks);
|
||||
}
|
||||
|
||||
static void
|
||||
spl_slab_free_vmem(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
|
||||
{
|
||||
ASSERT(skc->skc_magic == SKC_MAGIC);
|
||||
ASSERT(sks->sks_magic == SKS_MAGIC);
|
||||
|
||||
vmem_free(sks, SPL_KMEM_CACHE_OBJ_PER_SLAB *
|
||||
(skc->skc_obj_size + sizeof(spl_kmem_obj_t)));
|
||||
}
|
||||
|
||||
/* Removes slab from complete or partial list, so it must
|
||||
* be called with the 'skc->skc_lock' held.
|
||||
* */
|
||||
*/
|
||||
static void
|
||||
spl_slab_free(spl_kmem_slab_t *sks) {
|
||||
spl_kmem_cache_t *skc;
|
||||
spl_kmem_obj_t *sko, *n;
|
||||
int i = 0;
|
||||
ENTRY;
|
||||
|
||||
ASSERT(sks->sks_magic == SKS_MAGIC);
|
||||
ASSERT(sks->sks_ref == 0);
|
||||
skc = sks->sks_cache;
|
||||
skc->skc_obj_total -= sks->sks_objs;
|
||||
skc->skc_slab_total--;
|
||||
|
||||
skc = sks->sks_cache;
|
||||
ASSERT(skc->skc_magic == SKC_MAGIC);
|
||||
ASSERT(spin_is_locked(&skc->skc_lock));
|
||||
|
||||
list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) {
|
||||
ASSERT(sko->sko_magic == SKO_MAGIC);
|
||||
skc->skc_obj_total -= sks->sks_objs;
|
||||
skc->skc_slab_total--;
|
||||
list_del(&sks->sks_list);
|
||||
|
||||
/* Run destructors for being freed */
|
||||
/* Run destructors slab is being released */
|
||||
list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list)
|
||||
if (skc->skc_dtor)
|
||||
skc->skc_dtor(sko->sko_addr, skc->skc_private);
|
||||
|
||||
if (skc->skc_obj_size <= PAGE_SIZE)
|
||||
kmem_free(sko->sko_addr, skc->skc_obj_size);
|
||||
spl_slab_free_kmem(skc, sks);
|
||||
else
|
||||
vmem_free(sko->sko_addr, skc->skc_obj_size);
|
||||
|
||||
list_del(&sko->sko_list);
|
||||
kmem_cache_free(spl_obj_cache, sko);
|
||||
i++;
|
||||
}
|
||||
|
||||
ASSERT(sks->sks_objs == i);
|
||||
list_del(&sks->sks_list);
|
||||
kmem_cache_free(spl_slab_cache, sks);
|
||||
spl_slab_free_vmem(skc, sks);
|
||||
|
||||
EXIT;
|
||||
}
|
||||
@ -629,14 +731,13 @@ static spl_kmem_slab_t *
|
||||
spl_cache_grow(spl_kmem_cache_t *skc, int flags)
|
||||
{
|
||||
spl_kmem_slab_t *sks;
|
||||
spl_kmem_obj_t *sko;
|
||||
cycles_t start;
|
||||
ENTRY;
|
||||
|
||||
ASSERT(skc->skc_magic == SKC_MAGIC);
|
||||
|
||||
if (flags & __GFP_WAIT) {
|
||||
// flags |= __GFP_NOFAIL; /* XXX: Solaris assumes this */
|
||||
flags |= __GFP_NOFAIL;
|
||||
might_sleep();
|
||||
local_irq_enable();
|
||||
}
|
||||
@ -649,14 +750,6 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags)
|
||||
RETURN(NULL);
|
||||
}
|
||||
|
||||
/* Run all the constructors now that the slab is fully allocated */
|
||||
list_for_each_entry(sko, &sks->sks_free_list, sko_list) {
|
||||
ASSERT(sko->sko_magic == SKO_MAGIC);
|
||||
|
||||
if (skc->skc_ctor)
|
||||
skc->skc_ctor(sko->sko_addr, skc->skc_private, flags);
|
||||
}
|
||||
|
||||
if (flags & __GFP_WAIT)
|
||||
local_irq_disable();
|
||||
|
||||
@ -697,7 +790,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
|
||||
if (list_empty(&skc->skc_partial_list)) {
|
||||
spin_unlock(&skc->skc_lock);
|
||||
|
||||
if (unlikely((get_cycles() - start) > skc->skc_lock_refill))
|
||||
if (unlikely((get_cycles()-start)>skc->skc_lock_refill))
|
||||
skc->skc_lock_refill = get_cycles() - start;
|
||||
|
||||
sks = spl_cache_grow(skc, flags);
|
||||
@ -861,6 +954,7 @@ restart:
|
||||
}
|
||||
|
||||
local_irq_restore(irq_flags);
|
||||
ASSERT(obj);
|
||||
|
||||
/* Pre-emptively migrate object to CPU L1 cache */
|
||||
prefetchw(obj);
|
||||
|
@ -559,32 +559,32 @@ splat_kmem_test8_count(kmem_cache_priv_t *kcp, int threads)
|
||||
* eyeball the slab cache locking overhead to ensure it is reasonable.
|
||||
*/
|
||||
static int
|
||||
splat_kmem_test8(struct file *file, void *arg)
|
||||
splat_kmem_test8_sc(struct file *file, void *arg, int size, int count)
|
||||
{
|
||||
kmem_cache_priv_t kcp;
|
||||
kthread_t *thr;
|
||||
struct timespec start, stop, delta;
|
||||
char cache_name[16];
|
||||
int alloc, i;
|
||||
char cache_name[32];
|
||||
int i, j, threads = 32;
|
||||
|
||||
kcp.kcp_magic = SPLAT_KMEM_TEST_MAGIC;
|
||||
kcp.kcp_file = file;
|
||||
|
||||
splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%s",
|
||||
splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %s", "name",
|
||||
"time (sec)\tslabs \tobjs \thash\n");
|
||||
splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%s",
|
||||
splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %s", "",
|
||||
" \ttot/max/calc\ttot/max/calc\tsize/depth\n");
|
||||
|
||||
for (alloc = 1; alloc <= 4096; alloc *= 2) {
|
||||
kcp.kcp_size = 256;
|
||||
for (i = 1; i <= count; i *= 2) {
|
||||
kcp.kcp_size = size;
|
||||
kcp.kcp_count = 0;
|
||||
kcp.kcp_threads = 0;
|
||||
kcp.kcp_alloc = alloc;
|
||||
kcp.kcp_alloc = i;
|
||||
kcp.kcp_rc = 0;
|
||||
spin_lock_init(&kcp.kcp_lock);
|
||||
init_waitqueue_head(&kcp.kcp_waitq);
|
||||
|
||||
sprintf(cache_name, "%s-%d", SPLAT_KMEM_CACHE_NAME, alloc);
|
||||
sprintf(cache_name, "%s-%d-%d", SPLAT_KMEM_CACHE_NAME, size, i);
|
||||
kcp.kcp_cache = kmem_cache_create(cache_name, kcp.kcp_size, 0,
|
||||
splat_kmem_cache_test_constructor,
|
||||
splat_kmem_cache_test_destructor,
|
||||
@ -598,7 +598,7 @@ splat_kmem_test8(struct file *file, void *arg)
|
||||
|
||||
start = current_kernel_time();
|
||||
|
||||
for (i = 0; i < 32; i++) {
|
||||
for (j = 0; j < threads; j++) {
|
||||
thr = thread_create(NULL, 0, splat_kmem_test8_thread,
|
||||
&kcp, 0, &p0, TS_RUN, minclsyspri);
|
||||
ASSERT(thr != NULL);
|
||||
@ -610,15 +610,17 @@ splat_kmem_test8(struct file *file, void *arg)
|
||||
stop = current_kernel_time();
|
||||
delta = timespec_sub(stop, start);
|
||||
|
||||
splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%2ld.%09ld\t"
|
||||
splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "%-22s %2ld.%09ld\t"
|
||||
"%lu/%lu/%lu\t%lu/%lu/%lu\t%lu/%lu\n",
|
||||
kcp.kcp_cache->skc_name,
|
||||
delta.tv_sec, delta.tv_nsec,
|
||||
(unsigned long)kcp.kcp_cache->skc_slab_total,
|
||||
(unsigned long)kcp.kcp_cache->skc_slab_max,
|
||||
(unsigned long)(kcp.kcp_alloc * 32 / SPL_KMEM_CACHE_OBJ_PER_SLAB),
|
||||
(unsigned long)(kcp.kcp_alloc * threads /
|
||||
SPL_KMEM_CACHE_OBJ_PER_SLAB),
|
||||
(unsigned long)kcp.kcp_cache->skc_obj_total,
|
||||
(unsigned long)kcp.kcp_cache->skc_obj_max,
|
||||
(unsigned long)(kcp.kcp_alloc * 32),
|
||||
(unsigned long)(kcp.kcp_alloc * threads),
|
||||
(unsigned long)kcp.kcp_cache->skc_hash_size,
|
||||
(unsigned long)kcp.kcp_cache->skc_hash_depth);
|
||||
|
||||
@ -631,6 +633,22 @@ splat_kmem_test8(struct file *file, void *arg)
|
||||
return kcp.kcp_rc;
|
||||
}
|
||||
|
||||
/*
 * Entry point for kmem test 8.
 *
 * Runs splat_kmem_test8_sc() once per object size, starting at 16 bytes
 * and multiplying by 4 up to and including 1024*1024 bytes, passing
 * 1024 as the count argument each time.  Stops at the first non-zero
 * result.
 *
 * Returns 0 if every run returned 0, otherwise the first non-zero
 * result.
 */
static int
splat_kmem_test8(struct file *file, void *arg)
{
	int obj_size;
	int rc = 0;

	for (obj_size = 16; rc == 0 && obj_size <= 1024*1024; obj_size *= 4)
		rc = splat_kmem_test8_sc(file, arg, obj_size, 1024);

	return rc;
}
|
||||
|
||||
splat_subsystem_t *
|
||||
splat_kmem_init(void)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user