From 9b1b8e4c2459a02fc230cdda65b13908f263fd36 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 17 Feb 2009 15:52:18 -0800 Subject: [PATCH] kmem slab magazine ageing deadlock - The previous magazine ageing scheme relied on the on_each_cpu() function to call spl_magazine_age() on each cpu. It turns out this could deadlock with do_flush_tlb_all() which also relies on the IPI based on_each_cpu(). To avoid this problem a per-magazine delayed work item is created and independently scheduled to the correct cpu removing the need for on_each_cpu(). - Additionally two unused fields were removed from the type spl_kmem_cache_t, they were holdovers from previous cleanup. - struct work_struct work - struct timer_list timer --- include/sys/kmem.h | 6 +++--- module/spl/spl-kmem.c | 28 ++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/include/sys/kmem.h b/include/sys/kmem.h index dc66a9153..cad652c91 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -250,10 +250,12 @@ typedef void (*spl_kmem_dtor_t)(void *, void *); typedef void (*spl_kmem_reclaim_t)(void *); typedef struct spl_kmem_magazine { - uint32_t skm_magic; /* Sanity magic */ + uint32_t skm_magic; /* Sanity magic */ uint32_t skm_avail; /* Available objects */ uint32_t skm_size; /* Magazine size */ uint32_t skm_refill; /* Batch refill size */ + struct spl_kmem_cache *skm_cache; /* Owned by cache */ + struct delayed_work skm_work; /* Magazine reclaim work */ unsigned long skm_age; /* Last cache access */ void *skm_objs[0]; /* Object pointers */ } spl_kmem_magazine_t; @@ -296,8 +298,6 @@ typedef struct spl_kmem_cache { uint32_t skc_reap; /* Slab reclaim count */ atomic_t skc_ref; /* Ref count callers */ struct delayed_work skc_work; /* Slab reclaim work */ - struct work_struct work; - struct timer_list timer; struct list_head skc_list; /* List of caches linkage */ struct list_head skc_complete_list;/* Completely alloc'ed */ struct list_head skc_partial_list; /* 
Partially alloc'ed */ diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index ba7e19b4e..18613e799 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -932,12 +932,22 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) static void spl_magazine_age(void *data) { - spl_kmem_cache_t *skc = data; - spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + spl_kmem_magazine_t *skm = + spl_get_work_data(data, spl_kmem_magazine_t, skm_work.work); + spl_kmem_cache_t *skc = skm->skm_cache; + int i = smp_processor_id(); + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skc->skc_mag[i] == skm); if (skm->skm_avail > 0 && time_after(jiffies, skm->skm_age + skc->skc_delay * HZ)) (void)spl_cache_flush(skc, skm, skm->skm_refill); + + if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)) + schedule_delayed_work_on(i, &skm->skm_work, + skc->skc_delay / 3 * HZ); } /* @@ -949,12 +959,11 @@ spl_magazine_age(void *data) static void spl_cache_age(void *data) { - spl_kmem_cache_t *skc = + spl_kmem_cache_t *skc = spl_get_work_data(data, spl_kmem_cache_t, skc_work.work); ASSERT(skc->skc_magic == SKC_MAGIC); spl_slab_reclaim(skc, skc->skc_reap, 0); - spl_on_each_cpu(spl_magazine_age, skc, 0); if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)) schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ); @@ -1050,6 +1059,8 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int node) skm->skm_avail = 0; skm->skm_size = skc->skc_mag_size; skm->skm_refill = skc->skc_mag_refill; + skm->skm_cache = skc; + spl_init_delayed_work(&skm->skm_work, spl_magazine_age, skm); skm->skm_age = jiffies; } @@ -1095,6 +1106,11 @@ spl_magazine_create(spl_kmem_cache_t *skc) } } + /* Only after everything is allocated schedule magazine work */ + for_each_online_cpu(i) + schedule_delayed_work_on(i, &skc->skc_mag[i]->skm_work, + skc->skc_delay / 3 * HZ); + RETURN(0); } @@ -1245,6 +1261,7 @@ void spl_kmem_cache_destroy(spl_kmem_cache_t *skc) { 
DECLARE_WAIT_QUEUE_HEAD(wq); + int i; ENTRY; ASSERT(skc->skc_magic == SKC_MAGIC); @@ -1256,6 +1273,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) /* Cancel any and wait for any pending delayed work */ ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); cancel_delayed_work(&skc->skc_work); + for_each_online_cpu(i) + cancel_delayed_work(&skc->skc_mag[i]->skm_work); + flush_scheduled_work(); /* Wait until all current callers complete, this is mainly