mirror of https://git.proxmox.com/git/mirror_zfs.git
synced 2025-10-26 18:05:04 +03:00

	kmem_cache hardening and performance improvements
- Added a slab work queue task which gradually ages and frees slabs from the cache which have not been used recently.
- Optimized the slab packing algorithm to ensure each slab contains the maximum number of objects without creating too large a slab.
- Fixed a deadlock: we can never call kv_free() under the skc_lock.  We now unlink the objects and slabs from the cache itself and attach them to a private work list.  The contents of the list are then freed outside the spin lock.
- Moved the magazine create/destroy operations on to the local cpu.
- Further performance optimizations by minimizing use of the large per-cache skc_lock.  This includes the addition of the KMC_BIT_REAPING bit mask, which is used to prevent concurrent reaping and to defer new slab creation while reaping is occurring.
- Added the KMC_BIT_DESTROY bit mask, which is set when the cache is being destroyed; it is used to catch any task accessing the cache while it is being destroyed.
- Added comments to all the functions, and additional comments elsewhere, to make everything as clear as possible.
- Major cleanup and additions to the SPLAT kmem tests to more rigorously stress the cache implementation and look for any problems.  This includes correctness and performance tests.
- Updated the portable work queue interfaces.
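The deadlock fix above relies on a standard kernel idiom: while the spinlock is held, victims are only unlinked onto a private list, and the actual frees happen after the lock is dropped.  The following is a minimal userspace C sketch of that idea only; the names (node, cache, cache_reclaim) are illustrative and are not the SPL identifiers, which operate on spl_kmem_slab_t lists under skc->skc_lock.

/*
 * Sketch of the "unlink under the lock, free outside the lock" pattern.
 * The lock protects only the list manipulation; the expensive frees
 * are performed with no lock held, so they can never deadlock on it.
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;
	void *payload;
};

struct cache {
	pthread_mutex_t lock;   /* stands in for skc->skc_lock */
	struct node *idle;      /* idle entries eligible for release */
};

void cache_reclaim(struct cache *c)
{
	struct node *doomed, *next;

	pthread_mutex_lock(&c->lock);
	doomed = c->idle;       /* unlink everything onto a private list */
	c->idle = NULL;
	pthread_mutex_unlock(&c->lock);

	while (doomed != NULL) {        /* free with no lock held */
		next = doomed->next;
		free(doomed->payload);
		free(doomed);
		doomed = next;
	}
}

The KMC_BIT_REAPING and KMC_BIT_DESTROY flags added by this commit play a complementary role: test_and_set_bit() lets exactly one reaper proceed, so concurrent reapers and racing allocations can back off without ever contending on skc_lock.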
This commit is contained in:
parent 34e71c9e97
commit ea3e6ca9e5
author Brian Behlendorf
@@ -45,6 +45,7 @@ extern "C" {
 #include <asm/atomic_compat.h>
 #include <sys/types.h>
 #include <sys/debug.h>
+#include <sys/workqueue.h>
 
 /*
  * Memory allocation interfaces
@@ -161,17 +162,32 @@ kmem_alloc_tryhard(size_t size, size_t *alloc_size, int kmflags)
 /*
  * Slab allocation interfaces
  */
-#define KMC_NOTOUCH                     0x00000001
-#define KMC_NODEBUG                     0x00000002 /* Default behavior */
-#define KMC_NOMAGAZINE                  0x00000004 /* XXX: No disable support available */
-#define KMC_NOHASH                      0x00000008 /* XXX: No hash available */
-#define KMC_QCACHE                      0x00000010 /* XXX: Unsupported */
-#define KMC_KMEM			0x00000100 /* Use kmem cache */
-#define KMC_VMEM			0x00000200 /* Use vmem cache */
-#define KMC_OFFSLAB			0x00000400 /* Objects not on slab */
+enum {
+	KMC_BIT_NOTOUCH		= 0,	/* Don't update ages */
+	KMC_BIT_NODEBUG		= 1,	/* Default behavior */
+	KMC_BIT_NOMAGAZINE	= 2,	/* XXX: Unsupported */
+	KMC_BIT_NOHASH		= 3,	/* XXX: Unsupported */
+	KMC_BIT_QCACHE		= 4,	/* XXX: Unsupported */
+	KMC_BIT_KMEM		= 5,	/* Use kmem cache */
+	KMC_BIT_VMEM		= 6,	/* Use vmem cache */
+	KMC_BIT_OFFSLAB		= 7,	/* Objects not on slab */
+	KMC_BIT_REAPING		= 16,	/* Reaping in progress */
+	KMC_BIT_DESTROY		= 17,	/* Destroy in progress */
+};
 
-#define KMC_REAP_CHUNK                  256
-#define KMC_DEFAULT_SEEKS               DEFAULT_SEEKS
+#define KMC_NOTOUCH		(1 << KMC_BIT_NOTOUCH)
+#define KMC_NODEBUG		(1 << KMC_BIT_NODEBUG)
+#define KMC_NOMAGAZINE		(1 << KMC_BIT_NOMAGAZINE)
+#define KMC_NOHASH		(1 << KMC_BIT_NOHASH)
+#define KMC_QCACHE		(1 << KMC_BIT_QCACHE)
+#define KMC_KMEM		(1 << KMC_BIT_KMEM)
+#define KMC_VMEM		(1 << KMC_BIT_VMEM)
+#define KMC_OFFSLAB		(1 << KMC_BIT_OFFSLAB)
+#define KMC_REAPING		(1 << KMC_BIT_REAPING)
+#define KMC_DESTROY		(1 << KMC_BIT_DESTROY)
+
+#define KMC_REAP_CHUNK			INT_MAX
+#define KMC_DEFAULT_SEEKS		1
 
 #ifdef DEBUG_KMEM_UNIMPLEMENTED
 static __inline__ void kmem_init(void) {
@@ -223,9 +239,10 @@ extern struct rw_semaphore spl_kmem_cache_sem;
 #define SKS_MAGIC			0x22222222
 #define SKC_MAGIC			0x2c2c2c2c
 
-#define SPL_KMEM_CACHE_DELAY		5
-#define SPL_KMEM_CACHE_OBJ_PER_SLAB	32
-#define SPL_KMEM_CACHE_ALIGN		8
+#define SPL_KMEM_CACHE_DELAY		5	/* Minimum slab release age */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB	32	/* Target objects per slab */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN	8	/* Minimum objects per slab */
+#define SPL_KMEM_CACHE_ALIGN		8	/* Default object alignment */
 
 typedef int (*spl_kmem_ctor_t)(void *, void *, int);
 typedef void (*spl_kmem_dtor_t)(void *, void *);
@@ -258,24 +275,28 @@ typedef struct spl_kmem_slab {
 } spl_kmem_slab_t;
 
 typedef struct spl_kmem_cache {
-        uint32_t		skc_magic;	/* Sanity magic */
-        uint32_t		skc_name_size;	/* Name length */
-        char			*skc_name;	/* Name string */
+	uint32_t		skc_magic;	/* Sanity magic */
+	uint32_t		skc_name_size;	/* Name length */
+	char			*skc_name;	/* Name string */
 	spl_kmem_magazine_t	*skc_mag[NR_CPUS]; /* Per-CPU warm cache */
 	uint32_t		skc_mag_size;	/* Magazine size */
 	uint32_t		skc_mag_refill;	/* Magazine refill count */
-        spl_kmem_ctor_t		skc_ctor;	/* Constructor */
-        spl_kmem_dtor_t		skc_dtor;	/* Destructor */
-        spl_kmem_reclaim_t      skc_reclaim;	/* Reclaimator */
-        void			*skc_private;	/* Private data */
-        void			*skc_vmp;	/* Unused */
+	spl_kmem_ctor_t		skc_ctor;	/* Constructor */
+	spl_kmem_dtor_t		skc_dtor;	/* Destructor */
+	spl_kmem_reclaim_t	skc_reclaim;	/* Reclaimator */
+	void			*skc_private;	/* Private data */
+	void			*skc_vmp;	/* Unused */
 	uint32_t		skc_flags;	/* Flags */
 	uint32_t		skc_obj_size;	/* Object size */
 	uint32_t		skc_obj_align;	/* Object alignment */
 	uint32_t		skc_slab_objs;	/* Objects per slab */
-	uint32_t		skc_slab_size;  /* Slab size */
-	uint32_t		skc_delay;	/* slab reclaim interval */
-        struct list_head	skc_list;	/* List of caches linkage */
+	uint32_t		skc_slab_size;	/* Slab size */
+	uint32_t		skc_delay;	/* Slab reclaim interval */
+	atomic_t		skc_ref;	/* Ref count callers */
+	struct delayed_work	skc_work;	/* Slab reclaim work */
+        struct work_struct work;
+        struct timer_list timer;
+	struct list_head	skc_list;	/* List of caches linkage */
 	struct list_head	skc_complete_list;/* Completely alloc'ed */
 	struct list_head	skc_partial_list; /* Partially alloc'ed */
 	spinlock_t		skc_lock;	/* Cache lock */
@@ -283,7 +304,7 @@ typedef struct spl_kmem_cache {
 	uint64_t		skc_slab_create;/* Slab creates */
 	uint64_t		skc_slab_destroy;/* Slab destroys */
 	uint64_t		skc_slab_total;	/* Slab total current */
-	uint64_t		skc_slab_alloc; /* Slab alloc current */
+	uint64_t		skc_slab_alloc;	/* Slab alloc current */
 	uint64_t		skc_slab_max;	/* Slab max historic  */
 	uint64_t		skc_obj_total;	/* Obj total current */
 	uint64_t		skc_obj_alloc;	/* Obj alloc current */
|  | |||||||
| @ -203,18 +203,6 @@ extern int ddi_strtoul(const char *str, char **nptr, | |||||||
| #define offsetof(s, m)  ((size_t)(&(((s *)0)->m))) | #define offsetof(s, m)  ((size_t)(&(((s *)0)->m))) | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #ifdef HAVE_3ARGS_INIT_WORK |  | ||||||
| 
 |  | ||||||
| #define spl_init_work(wq,cb,d)	INIT_WORK((wq), (void *)(cb), (void *)(d)) |  | ||||||
| #define spl_get_work_data(type,field,data)	(data) |  | ||||||
| 
 |  | ||||||
| #else |  | ||||||
| 
 |  | ||||||
| #define spl_init_work(wq,cb,d)	INIT_WORK((wq), (void *)(cb)); |  | ||||||
| #define spl_get_work_data(type,field,data)	container_of(data,type,field) |  | ||||||
| 
 |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| #ifdef  __cplusplus | #ifdef  __cplusplus | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  | |||||||
| @ -35,8 +35,7 @@ | |||||||
| extern vmem_t *zio_alloc_arena;		/* arena for zio caches */ | extern vmem_t *zio_alloc_arena;		/* arena for zio caches */ | ||||||
| 
 | 
 | ||||||
| #define physmem				num_physpages | #define physmem				num_physpages | ||||||
| #define freemem				nr_free_pages() // Expensive on linux,
 | #define freemem				nr_free_pages() | ||||||
| 							// cheap on solaris
 |  | ||||||
| #define minfree				0 | #define minfree				0 | ||||||
| #define needfree			0	/* # of needed pages */ | #define needfree			0	/* # of needed pages */ | ||||||
| #define ptob(pages)			(pages * PAGE_SIZE) | #define ptob(pages)			(pages * PAGE_SIZE) | ||||||
|  | |||||||
| @ -132,10 +132,6 @@ EXPORT_SYMBOL(kmem_set_warning); | |||||||
|  * small virtual address space on 32bit arches.  This will seriously |  * small virtual address space on 32bit arches.  This will seriously | ||||||
|  * constrain the size of the slab caches and their performance. |  * constrain the size of the slab caches and their performance. | ||||||
|  * |  * | ||||||
|  * XXX: Implement work requests to keep an eye on each cache and |  | ||||||
|  *      shrink them via spl_slab_reclaim() when they are wasting lots |  | ||||||
|  *      of space.  Currently this process is driven by the reapers. |  | ||||||
|  * |  | ||||||
|  * XXX: Improve the partial slab list by carefully maintaining a |  * XXX: Improve the partial slab list by carefully maintaining a | ||||||
|  *      strict ordering of fullest to emptiest slabs based on |  *      strict ordering of fullest to emptiest slabs based on | ||||||
|  *      the slab reference count.  This gaurentees the when freeing |  *      the slab reference count.  This gaurentees the when freeing | ||||||
| @ -571,7 +567,8 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size) | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* It's important that we pack the spl_kmem_obj_t structure and the
 | /*
 | ||||||
|  |  * It's important that we pack the spl_kmem_obj_t structure and the | ||||||
|  * actual objects in to one large address space to minimize the number |  * actual objects in to one large address space to minimize the number | ||||||
|  * of calls to the allocator.  It is far better to do a few large |  * of calls to the allocator.  It is far better to do a few large | ||||||
|  * allocations and then subdivide it ourselves.  Now which allocator |  * allocations and then subdivide it ourselves.  Now which allocator | ||||||
| @ -662,14 +659,17 @@ out: | |||||||
| 	RETURN(sks); | 	RETURN(sks); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Removes slab from complete or partial list, so it must
 | /*
 | ||||||
|  * be called with the 'skc->skc_lock' held. |  * Remove a slab from complete or partial list, it must be called with | ||||||
|  |  * the 'skc->skc_lock' held but the actual free must be performed | ||||||
|  |  * outside the lock to prevent deadlocking on vmem addresses. | ||||||
|  */ |  */ | ||||||
| static void | static void | ||||||
| spl_slab_free(spl_kmem_slab_t *sks) { | spl_slab_free(spl_kmem_slab_t *sks, | ||||||
|  | 	      struct list_head *sks_list, struct list_head *sko_list) | ||||||
|  | { | ||||||
| 	spl_kmem_cache_t *skc; | 	spl_kmem_cache_t *skc; | ||||||
| 	spl_kmem_obj_t *sko, *n; | 	spl_kmem_obj_t *sko, *n; | ||||||
| 	int size; |  | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(sks->sks_magic == SKS_MAGIC); | 	ASSERT(sks->sks_magic == SKS_MAGIC); | ||||||
| @ -682,114 +682,190 @@ spl_slab_free(spl_kmem_slab_t *sks) { | |||||||
| 	skc->skc_obj_total -= sks->sks_objs; | 	skc->skc_obj_total -= sks->sks_objs; | ||||||
| 	skc->skc_slab_total--; | 	skc->skc_slab_total--; | ||||||
| 	list_del(&sks->sks_list); | 	list_del(&sks->sks_list); | ||||||
| 	size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) + |  | ||||||
| 	       P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align); |  | ||||||
| 
 | 
 | ||||||
| 	/* Run destructors slab is being released */ | 	/* Run destructors slab is being released */ | ||||||
| 	list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) { | 	list_for_each_entry_safe(sko, n, &sks->sks_free_list, sko_list) { | ||||||
| 		ASSERT(sko->sko_magic == SKO_MAGIC); | 		ASSERT(sko->sko_magic == SKO_MAGIC); | ||||||
|  | 		list_del(&sko->sko_list); | ||||||
| 
 | 
 | ||||||
| 		if (skc->skc_dtor) | 		if (skc->skc_dtor) | ||||||
| 			skc->skc_dtor(sko->sko_addr, skc->skc_private); | 			skc->skc_dtor(sko->sko_addr, skc->skc_private); | ||||||
| 
 | 
 | ||||||
| 		if (skc->skc_flags & KMC_OFFSLAB) | 		if (skc->skc_flags & KMC_OFFSLAB) | ||||||
| 			kv_free(skc, sko->sko_addr, size); | 			list_add(&sko->sko_list, sko_list); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	kv_free(skc, sks, skc->skc_slab_size); | 	list_add(&sks->sks_list, sks_list); | ||||||
| 	EXIT; | 	EXIT; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int | /*
 | ||||||
| __spl_slab_reclaim(spl_kmem_cache_t *skc) |  * Traverses all the partial slabs attached to a cache and free those | ||||||
|  |  * which which are currently empty, and have not been touched for | ||||||
|  |  * skc_delay seconds.  This is to avoid thrashing. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | spl_slab_reclaim(spl_kmem_cache_t *skc, int flag) | ||||||
| { | { | ||||||
| 	spl_kmem_slab_t *sks, *m; | 	spl_kmem_slab_t *sks, *m; | ||||||
| 	int rc = 0; | 	spl_kmem_obj_t *sko, *n; | ||||||
|  | 	LIST_HEAD(sks_list); | ||||||
|  | 	LIST_HEAD(sko_list); | ||||||
|  | 	int size; | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(spin_is_locked(&skc->skc_lock)); |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Free empty slabs which have not been touched in skc_delay | 	 * Move empty slabs and objects which have not been touched in | ||||||
| 	 * seconds.  This delay time is important to avoid thrashing. | 	 * skc_delay seconds on to private lists to be freed outside | ||||||
| 	 * Empty slabs will be at the end of the skc_partial_list. | 	 * the spin lock.  This delay time is important to avoid | ||||||
|  | 	 * thrashing however when flag is set the delay will not be | ||||||
|  | 	 * used.  Empty slabs will be at the end of the skc_partial_list. | ||||||
| 	 */ | 	 */ | ||||||
|  | 	spin_lock(&skc->skc_lock); | ||||||
|         list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, |         list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, | ||||||
| 					 sks_list) { | 					 sks_list) { | ||||||
| 		if (sks->sks_ref > 0) | 		if (sks->sks_ref > 0) | ||||||
| 		       break; | 		       break; | ||||||
| 
 | 
 | ||||||
| 		if (time_after(jiffies, sks->sks_age + skc->skc_delay * HZ)) { | 		if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)) | ||||||
| 			spl_slab_free(sks); | 			spl_slab_free(sks, &sks_list, &sko_list); | ||||||
| 			rc++; |  | ||||||
| 		} |  | ||||||
| 	} | 	} | ||||||
| 
 |  | ||||||
| 	/* Returns number of slabs reclaimed */ |  | ||||||
| 	RETURN(rc); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static int |  | ||||||
| spl_slab_reclaim(spl_kmem_cache_t *skc) |  | ||||||
| { |  | ||||||
| 	int rc; |  | ||||||
| 	ENTRY; |  | ||||||
| 
 |  | ||||||
| 	spin_lock(&skc->skc_lock); |  | ||||||
| 	rc = __spl_slab_reclaim(skc); |  | ||||||
| 	spin_unlock(&skc->skc_lock); | 	spin_unlock(&skc->skc_lock); | ||||||
| 
 | 
 | ||||||
| 	RETURN(rc); | 	/*
 | ||||||
|  | 	 * We only have list of spl_kmem_obj_t's if they are located off | ||||||
|  | 	 * the slab, otherwise they get feed with the spl_kmem_slab_t. | ||||||
|  | 	 */ | ||||||
|  | 	if (!list_empty(&sko_list)) { | ||||||
|  | 		ASSERT(skc->skc_flags & KMC_OFFSLAB); | ||||||
|  | 
 | ||||||
|  | 		size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) + | ||||||
|  | 		       P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align); | ||||||
|  | 
 | ||||||
|  | 		list_for_each_entry_safe(sko, n, &sko_list, sko_list) | ||||||
|  | 			kv_free(skc, sko->sko_addr, size); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	list_for_each_entry_safe(sks, m, &sks_list, sks_list) | ||||||
|  | 		kv_free(skc, sks, skc->skc_slab_size); | ||||||
|  | 
 | ||||||
|  | 	EXIT; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Size slabs properly to ensure they are not too large */ | /*
 | ||||||
|  |  * Called regularly on all caches to age objects out of the magazines | ||||||
|  |  * which have not been access in skc->skc_delay seconds.  This prevents | ||||||
|  |  * idle magazines from holding memory which might be better used by | ||||||
|  |  * other caches or parts of the system.  The delay is present to | ||||||
|  |  * prevent thrashing the magazine. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | spl_magazine_age(void *data) | ||||||
|  | { | ||||||
|  | 	spl_kmem_cache_t *skc = data; | ||||||
|  | 	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; | ||||||
|  | 
 | ||||||
|  | 	if (skm->skm_avail > 0 && | ||||||
|  | 	    time_after(jiffies, skm->skm_age + skc->skc_delay * HZ)) | ||||||
|  | 		(void)spl_cache_flush(skc, skm, skm->skm_refill); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Called regularly to keep a downward pressure on the size of idle | ||||||
|  |  * magazines and to release free slabs from the cache.  This function | ||||||
|  |  * never calls the registered reclaim function, that only occures | ||||||
|  |  * under memory pressure or with a direct call to spl_kmem_reap(). | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | spl_cache_age(void *data) | ||||||
|  | { | ||||||
|  |         spl_kmem_cache_t *skc = | ||||||
|  | 		spl_get_work_data(data, spl_kmem_cache_t, skc_work.work); | ||||||
|  | 
 | ||||||
|  | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
|  | 	on_each_cpu(spl_magazine_age, skc, 0, 1); | ||||||
|  | 	spl_slab_reclaim(skc, 0); | ||||||
|  | 
 | ||||||
|  | 	if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)) | ||||||
|  | 		schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Size a slab based on the size of each aliged object plus spl_kmem_obj_t. | ||||||
|  |  * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB.  However, | ||||||
|  |  * for very small objects we may end up with more than this so as not | ||||||
|  |  * to waste space in the minimal allocation of a single page.  Also for | ||||||
|  |  * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN, | ||||||
|  |  * lower than this and we will fail. | ||||||
|  |  */ | ||||||
| static int | static int | ||||||
| spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) | spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) | ||||||
| { | { | ||||||
| 	int max = ((uint64_t)1 << (MAX_ORDER - 1)) * PAGE_SIZE; | 	int sks_size, obj_size, max_size, align; | ||||||
| 	int align = skc->skc_obj_align; |  | ||||||
| 
 |  | ||||||
| 	*objs = SPL_KMEM_CACHE_OBJ_PER_SLAB; |  | ||||||
| 
 | 
 | ||||||
| 	if (skc->skc_flags & KMC_OFFSLAB) { | 	if (skc->skc_flags & KMC_OFFSLAB) { | ||||||
|  | 		*objs = SPL_KMEM_CACHE_OBJ_PER_SLAB; | ||||||
| 		*size = sizeof(spl_kmem_slab_t); | 		*size = sizeof(spl_kmem_slab_t); | ||||||
| 	} else { | 	} else { | ||||||
| resize: | 		align = skc->skc_obj_align; | ||||||
| 		*size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align) + | 		sks_size = P2ROUNDUP(sizeof(spl_kmem_slab_t), align); | ||||||
| 			*objs * (P2ROUNDUP(skc->skc_obj_size, align) + | 		obj_size = P2ROUNDUP(skc->skc_obj_size, align) + | ||||||
| 		        P2ROUNDUP(sizeof(spl_kmem_obj_t), align)); |                            P2ROUNDUP(sizeof(spl_kmem_obj_t), align); | ||||||
| 
 | 
 | ||||||
| 		if (*size > max) | 		if (skc->skc_flags & KMC_KMEM) | ||||||
| 			GOTO(resize, *objs = *objs - 1); | 			max_size = ((uint64_t)1 << (MAX_ORDER-1)) * PAGE_SIZE; | ||||||
|  | 		else | ||||||
|  | 			max_size = (32 * 1024 * 1024); | ||||||
| 
 | 
 | ||||||
| 		ASSERT(*objs > 0); | 		for (*size = PAGE_SIZE; *size <= max_size; *size += PAGE_SIZE) { | ||||||
|  | 			*objs = (*size - sks_size) / obj_size; | ||||||
|  | 			if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB) | ||||||
|  | 				RETURN(0); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		/*
 | ||||||
|  | 		 * Unable to satisfy target objets per slab, fallback to | ||||||
|  | 		 * allocating a maximally sized slab and assuming it can | ||||||
|  | 		 * contain the minimum objects count use it.  If not fail. | ||||||
|  | 		 */ | ||||||
|  | 		*size = max_size; | ||||||
|  | 		*objs = (*size - sks_size) / obj_size; | ||||||
|  | 		if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN) | ||||||
|  | 			RETURN(0); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	ASSERTF(*size <= max, "%d < %d\n", *size, max); | 	RETURN(-ENOSPC); | ||||||
| 	RETURN(0); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Make a guess at reasonable per-cpu magazine size based on the size of | ||||||
|  |  * each object and the cost of caching N of them in each magazine.  Long | ||||||
|  |  * term this should really adapt based on an observed usage heuristic. | ||||||
|  |  */ | ||||||
| static int | static int | ||||||
| spl_magazine_size(spl_kmem_cache_t *skc) | spl_magazine_size(spl_kmem_cache_t *skc) | ||||||
| { | { | ||||||
| 	int size, align = skc->skc_obj_align; | 	int size, align = skc->skc_obj_align; | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	/* Guesses for reasonable magazine sizes, they
 | 	/* Per-magazine sizes below assume a 4Kib page size */ | ||||||
| 	 * should really adapt based on observed usage. */ |  | ||||||
| 	if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256)) | 	if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 256)) | ||||||
| 		size = 4; | 		size = 4;  /* Minimum 4Mib per-magazine */ | ||||||
| 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32)) | 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE * 32)) | ||||||
| 		size = 16; | 		size = 16; /* Minimum 2Mib per-magazine */ | ||||||
| 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE)) | 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE)) | ||||||
| 		size = 64; | 		size = 64; /* Minimum 256Kib per-magazine */ | ||||||
| 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4)) | 	else if (P2ROUNDUP(skc->skc_obj_size, align) > (PAGE_SIZE / 4)) | ||||||
| 		size = 128; | 		size = 128; /* Minimum 128Kib per-magazine */ | ||||||
| 	else | 	else | ||||||
| 		size = 512; | 		size = 256; | ||||||
| 
 | 
 | ||||||
| 	RETURN(size); | 	RETURN(size); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Allocate a per-cpu magazine to assoicate with a specific core. | ||||||
|  |  */ | ||||||
| static spl_kmem_magazine_t * | static spl_kmem_magazine_t * | ||||||
| spl_magazine_alloc(spl_kmem_cache_t *skc, int node) | spl_magazine_alloc(spl_kmem_cache_t *skc, int node) | ||||||
| { | { | ||||||
| @ -798,19 +874,21 @@ spl_magazine_alloc(spl_kmem_cache_t *skc, int node) | |||||||
| 	           sizeof(void *) * skc->skc_mag_size; | 	           sizeof(void *) * skc->skc_mag_size; | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	skm = kmem_alloc_node(size, GFP_KERNEL, node); | 	skm = kmem_alloc_node(size, GFP_KERNEL | __GFP_NOFAIL, node); | ||||||
| 	if (skm) { | 	if (skm) { | ||||||
| 		skm->skm_magic = SKM_MAGIC; | 		skm->skm_magic = SKM_MAGIC; | ||||||
| 		skm->skm_avail = 0; | 		skm->skm_avail = 0; | ||||||
| 		skm->skm_size = skc->skc_mag_size; | 		skm->skm_size = skc->skc_mag_size; | ||||||
| 		skm->skm_refill = skc->skc_mag_refill; | 		skm->skm_refill = skc->skc_mag_refill; | ||||||
| 		if (!(skc->skc_flags & KMC_NOTOUCH)) | 		skm->skm_age = jiffies; | ||||||
| 			skm->skm_age = jiffies; |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	RETURN(skm); | 	RETURN(skm); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Free a per-cpu magazine assoicated with a specific core. | ||||||
|  |  */ | ||||||
| static void | static void | ||||||
| spl_magazine_free(spl_kmem_magazine_t *skm) | spl_magazine_free(spl_kmem_magazine_t *skm) | ||||||
| { | { | ||||||
| @ -825,44 +903,72 @@ spl_magazine_free(spl_kmem_magazine_t *skm) | |||||||
| 	EXIT; | 	EXIT; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void | ||||||
|  | __spl_magazine_create(void *data) | ||||||
|  | { | ||||||
|  |         spl_kmem_cache_t *skc = data; | ||||||
|  | 	int id = smp_processor_id(); | ||||||
|  | 
 | ||||||
|  | 	skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id)); | ||||||
|  | 	ASSERT(skc->skc_mag[id]); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Create all pre-cpu magazines of reasonable sizes. | ||||||
|  |  */ | ||||||
| static int | static int | ||||||
| spl_magazine_create(spl_kmem_cache_t *skc) | spl_magazine_create(spl_kmem_cache_t *skc) | ||||||
| { | { | ||||||
| 	int i; |  | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	skc->skc_mag_size = spl_magazine_size(skc); | 	skc->skc_mag_size = spl_magazine_size(skc); | ||||||
| 	skc->skc_mag_refill = (skc->skc_mag_size + 1)  / 2; | 	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; | ||||||
| 
 | 	on_each_cpu(__spl_magazine_create, skc, 0, 1); | ||||||
| 	for_each_online_cpu(i) { |  | ||||||
| 		skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i)); |  | ||||||
| 		if (!skc->skc_mag[i]) { |  | ||||||
| 			for (i--; i >= 0; i--) |  | ||||||
| 				spl_magazine_free(skc->skc_mag[i]); |  | ||||||
| 
 |  | ||||||
| 			RETURN(-ENOMEM); |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 
 | 
 | ||||||
| 	RETURN(0); | 	RETURN(0); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void | ||||||
|  | __spl_magazine_destroy(void *data) | ||||||
|  | { | ||||||
|  |         spl_kmem_cache_t *skc = data; | ||||||
|  | 	spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; | ||||||
|  | 
 | ||||||
|  | 	(void)spl_cache_flush(skc, skm, skm->skm_avail); | ||||||
|  | 	spl_magazine_free(skm); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Destroy all pre-cpu magazines. | ||||||
|  |  */ | ||||||
| static void | static void | ||||||
| spl_magazine_destroy(spl_kmem_cache_t *skc) | spl_magazine_destroy(spl_kmem_cache_t *skc) | ||||||
| { | { | ||||||
|         spl_kmem_magazine_t *skm; |  | ||||||
| 	int i; |  | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 	on_each_cpu(__spl_magazine_destroy, skc, 0, 1); | ||||||
| 	for_each_online_cpu(i) { |  | ||||||
| 		skm = skc->skc_mag[i]; |  | ||||||
| 		(void)spl_cache_flush(skc, skm, skm->skm_avail); |  | ||||||
| 		spl_magazine_free(skm); |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	EXIT; | 	EXIT; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Create a object cache based on the following arguments: | ||||||
|  |  * name		cache name | ||||||
|  |  * size		cache object size | ||||||
|  |  * align	cache object alignment | ||||||
|  |  * ctor		cache object constructor | ||||||
|  |  * dtor		cache object destructor | ||||||
|  |  * reclaim	cache object reclaim | ||||||
|  |  * priv		cache private data for ctor/dtor/reclaim | ||||||
|  |  * vmp		unused must be NULL | ||||||
|  |  * flags | ||||||
|  |  *	KMC_NOTOUCH	Disable cache object aging (unsupported) | ||||||
|  |  *	KMC_NODEBUG	Disable debugging (unsupported) | ||||||
|  |  *	KMC_NOMAGAZINE	Disable magazine (unsupported) | ||||||
|  |  *	KMC_NOHASH      Disable hashing (unsupported) | ||||||
|  |  *	KMC_QCACHE	Disable qcache (unsupported) | ||||||
|  |  *	KMC_KMEM	Force kmem backed cache | ||||||
|  |  *	KMC_VMEM        Force vmem backed cache | ||||||
|  |  *	KMC_OFFSLAB	Locate objects off the slab | ||||||
|  |  */ | ||||||
| spl_kmem_cache_t * | spl_kmem_cache_t * | ||||||
| spl_kmem_cache_create(char *name, size_t size, size_t align, | spl_kmem_cache_create(char *name, size_t size, size_t align, | ||||||
|                       spl_kmem_ctor_t ctor, |                       spl_kmem_ctor_t ctor, | ||||||
| @ -908,6 +1014,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, | |||||||
| 	skc->skc_obj_size = size; | 	skc->skc_obj_size = size; | ||||||
| 	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; | 	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; | ||||||
| 	skc->skc_delay = SPL_KMEM_CACHE_DELAY; | 	skc->skc_delay = SPL_KMEM_CACHE_DELAY; | ||||||
|  | 	atomic_set(&skc->skc_ref, 0); | ||||||
| 
 | 
 | ||||||
| 	INIT_LIST_HEAD(&skc->skc_list); | 	INIT_LIST_HEAD(&skc->skc_list); | ||||||
| 	INIT_LIST_HEAD(&skc->skc_complete_list); | 	INIT_LIST_HEAD(&skc->skc_complete_list); | ||||||
| @ -947,6 +1054,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align, | |||||||
| 	if (rc) | 	if (rc) | ||||||
| 		GOTO(out, rc); | 		GOTO(out, rc); | ||||||
| 
 | 
 | ||||||
|  | 	spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc); | ||||||
|  | 	schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ); | ||||||
|  | 
 | ||||||
| 	down_write(&spl_kmem_cache_sem); | 	down_write(&spl_kmem_cache_sem); | ||||||
| 	list_add_tail(&skc->skc_list, &spl_kmem_cache_list); | 	list_add_tail(&skc->skc_list, &spl_kmem_cache_list); | ||||||
| 	up_write(&spl_kmem_cache_sem); | 	up_write(&spl_kmem_cache_sem); | ||||||
| @ -959,10 +1069,13 @@ out: | |||||||
| } | } | ||||||
| EXPORT_SYMBOL(spl_kmem_cache_create); | EXPORT_SYMBOL(spl_kmem_cache_create); | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Destroy a cache and all objects assoicated with the cache. | ||||||
|  |  */ | ||||||
| void | void | ||||||
| spl_kmem_cache_destroy(spl_kmem_cache_t *skc) | spl_kmem_cache_destroy(spl_kmem_cache_t *skc) | ||||||
| { | { | ||||||
|         spl_kmem_slab_t *sks, *m; | 	DECLARE_WAIT_QUEUE_HEAD(wq); | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(skc->skc_magic == SKC_MAGIC); | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
| @ -971,20 +1084,27 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) | |||||||
| 	list_del_init(&skc->skc_list); | 	list_del_init(&skc->skc_list); | ||||||
| 	up_write(&spl_kmem_cache_sem); | 	up_write(&spl_kmem_cache_sem); | ||||||
| 
 | 
 | ||||||
|  | 	/* Cancel any and wait for any pending delayed work */ | ||||||
|  | 	ASSERT(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); | ||||||
|  | 	cancel_delayed_work(&skc->skc_work); | ||||||
|  | 	flush_scheduled_work(); | ||||||
|  | 
 | ||||||
|  | 	/* Wait until all current callers complete, this is mainly
 | ||||||
|  | 	 * to catch the case where a low memory situation triggers a | ||||||
|  | 	 * cache reaping action which races with this destroy. */ | ||||||
|  | 	wait_event(wq, atomic_read(&skc->skc_ref) == 0); | ||||||
|  | 
 | ||||||
| 	spl_magazine_destroy(skc); | 	spl_magazine_destroy(skc); | ||||||
|  | 	spl_slab_reclaim(skc, 1); | ||||||
| 	spin_lock(&skc->skc_lock); | 	spin_lock(&skc->skc_lock); | ||||||
| 
 | 
 | ||||||
| 	/* Validate there are no objects in use and free all the
 | 	/* Validate there are no objects in use and free all the
 | ||||||
| 	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ | 	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ | ||||||
|  | 	ASSERT3U(skc->skc_slab_alloc, ==, 0); | ||||||
|  | 	ASSERT3U(skc->skc_obj_alloc, ==, 0); | ||||||
|  | 	ASSERT3U(skc->skc_slab_total, ==, 0); | ||||||
|  | 	ASSERT3U(skc->skc_obj_total, ==, 0); | ||||||
| 	ASSERT(list_empty(&skc->skc_complete_list)); | 	ASSERT(list_empty(&skc->skc_complete_list)); | ||||||
| 	ASSERT(skc->skc_slab_alloc == 0); |  | ||||||
| 	ASSERT(skc->skc_obj_alloc == 0); |  | ||||||
| 
 |  | ||||||
| 	list_for_each_entry_safe(sks, m, &skc->skc_partial_list, sks_list) |  | ||||||
| 		spl_slab_free(sks); |  | ||||||
| 
 |  | ||||||
| 	ASSERT(skc->skc_slab_total == 0); |  | ||||||
| 	ASSERT(skc->skc_obj_total == 0); |  | ||||||
| 
 | 
 | ||||||
| 	kmem_free(skc->skc_name, skc->skc_name_size); | 	kmem_free(skc->skc_name, skc->skc_name_size); | ||||||
| 	spin_unlock(&skc->skc_lock); | 	spin_unlock(&skc->skc_lock); | ||||||
| @ -995,6 +1115,10 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) | |||||||
| } | } | ||||||
| EXPORT_SYMBOL(spl_kmem_cache_destroy); | EXPORT_SYMBOL(spl_kmem_cache_destroy); | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Allocate an object from a slab attached to the cache.  This is used to | ||||||
|  |  * repopulate the per-cpu magazine caches in batches when they run low. | ||||||
|  |  */ | ||||||
| static void * | static void * | ||||||
| spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) | spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) | ||||||
| { | { | ||||||
| @ -1030,10 +1154,11 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) | |||||||
| 	return sko->sko_addr; | 	return sko->sko_addr; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* No available objects create a new slab.  Since this is an
 | /*
 | ||||||
|  * expensive operation we do it without holding the spinlock |  * No available objects on any slabsi, create a new slab.  Since this | ||||||
|  * and only briefly aquire it when we link in the fully |  * is an expensive operation we do it without holding the spinlock and | ||||||
|  * allocated and constructed slab. |  * only briefly aquire it when we link in the fully allocated and | ||||||
|  |  * constructed slab. | ||||||
|  */ |  */ | ||||||
| static spl_kmem_slab_t * | static spl_kmem_slab_t * | ||||||
| spl_cache_grow(spl_kmem_cache_t *skc, int flags) | spl_cache_grow(spl_kmem_cache_t *skc, int flags) | ||||||
| @ -1042,34 +1167,42 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags) | |||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(skc->skc_magic == SKC_MAGIC); | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
|  | 	local_irq_enable(); | ||||||
|  | 	might_sleep(); | ||||||
| 
 | 
 | ||||||
| 	if (flags & __GFP_WAIT) { | 	/*
 | ||||||
| 		flags |= __GFP_NOFAIL; | 	 * Before allocating a new slab check if the slab is being reaped. | ||||||
| 		local_irq_enable(); | 	 * If it is there is a good chance we can wait until it finishes | ||||||
| 		might_sleep(); | 	 * and then use one of the newly freed but not aged-out slabs. | ||||||
|  | 	 */ | ||||||
|  | 	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { | ||||||
|  | 		schedule(); | ||||||
|  | 		GOTO(out, sks= NULL); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	sks = spl_slab_alloc(skc, flags); | 	/* Allocate a new slab for the cache */ | ||||||
| 	if (sks == NULL) { | 	sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | __GFP_NOWARN); | ||||||
| 	        if (flags & __GFP_WAIT) | 	if (sks == NULL) | ||||||
| 			local_irq_disable(); | 		GOTO(out, sks = NULL); | ||||||
| 
 | 
 | ||||||
| 		RETURN(NULL); | 	/* Link the new empty slab in to the end of skc_partial_list. */ | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if (flags & __GFP_WAIT) |  | ||||||
| 		local_irq_disable(); |  | ||||||
| 
 |  | ||||||
| 	/* Link the new empty slab in to the end of skc_partial_list */ |  | ||||||
| 	spin_lock(&skc->skc_lock); | 	spin_lock(&skc->skc_lock); | ||||||
| 	skc->skc_slab_total++; | 	skc->skc_slab_total++; | ||||||
| 	skc->skc_obj_total += sks->sks_objs; | 	skc->skc_obj_total += sks->sks_objs; | ||||||
| 	list_add_tail(&sks->sks_list, &skc->skc_partial_list); | 	list_add_tail(&sks->sks_list, &skc->skc_partial_list); | ||||||
| 	spin_unlock(&skc->skc_lock); | 	spin_unlock(&skc->skc_lock); | ||||||
|  | out: | ||||||
|  | 	local_irq_disable(); | ||||||
| 
 | 
 | ||||||
| 	RETURN(sks); | 	RETURN(sks); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Refill a per-cpu magazine with objects from the slabs for this | ||||||
|  |  * cache.  Ideally the magazine can be repopulated using existing | ||||||
|  |  * objects which have been released, however if we are unable to | ||||||
|  |  * locate enough free objects new slabs of objects will be created. | ||||||
|  |  */ | ||||||
| static int | static int | ||||||
| spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) | spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) | ||||||
| { | { | ||||||
| @ -1080,13 +1213,11 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) | |||||||
| 	ASSERT(skc->skc_magic == SKC_MAGIC); | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
| 	ASSERT(skm->skm_magic == SKM_MAGIC); | 	ASSERT(skm->skm_magic == SKM_MAGIC); | ||||||
| 
 | 
 | ||||||
| 	/* XXX: Check for refill bouncing by age perhaps */ |  | ||||||
| 	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); | 	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); | ||||||
| 
 |  | ||||||
| 	spin_lock(&skc->skc_lock); | 	spin_lock(&skc->skc_lock); | ||||||
| 
 | 
 | ||||||
| 	while (refill > 0) { | 	while (refill > 0) { | ||||||
| 		/* No slabs available we must grow the cache */ | 		/* No slabs available we may need to grow the cache */ | ||||||
| 		if (list_empty(&skc->skc_partial_list)) { | 		if (list_empty(&skc->skc_partial_list)) { | ||||||
| 			spin_unlock(&skc->skc_lock); | 			spin_unlock(&skc->skc_lock); | ||||||
| 
 | 
 | ||||||
| @ -1135,6 +1266,9 @@ out: | |||||||
| 	RETURN(rc); | 	RETURN(rc); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Release an object back to the slab from which it came. | ||||||
|  |  */ | ||||||
| static void | static void | ||||||
| spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) | spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) | ||||||
| { | { | ||||||
| @ -1176,6 +1310,13 @@ spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) | |||||||
| 	EXIT; | 	EXIT; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Release a batch of objects from a per-cpu magazine back to their | ||||||
|  |  * respective slabs.  This occurs when we exceed the magazine size, | ||||||
|  |  * are under memory pressure, when the cache is idle, or during | ||||||
|  |  * cache cleanup.  The flush argument contains the number of entries | ||||||
|  |  * to remove from the magazine. | ||||||
|  |  */ | ||||||
| static int | static int | ||||||
| spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) | spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) | ||||||
| { | { | ||||||
| @ -1185,12 +1326,17 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) | |||||||
| 	ASSERT(skc->skc_magic == SKC_MAGIC); | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
| 	ASSERT(skm->skm_magic == SKM_MAGIC); | 	ASSERT(skm->skm_magic == SKM_MAGIC); | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * XXX: Currently we simply return objects from the magazine to | ||||||
|  | 	 * the slabs in fifo order.  The ideal thing to do from a memory | ||||||
|  | 	 * fragmentation standpoint is to cheaply determine the set of | ||||||
|  | 	 * objects in the magazine which will result in the largest | ||||||
|  | 	 * number of free slabs if released from the magazine. | ||||||
|  | 	 */ | ||||||
| 	spin_lock(&skc->skc_lock); | 	spin_lock(&skc->skc_lock); | ||||||
| 
 |  | ||||||
| 	for (i = 0; i < count; i++) | 	for (i = 0; i < count; i++) | ||||||
| 		spl_cache_shrink(skc, skm->skm_objs[i]); | 		spl_cache_shrink(skc, skm->skm_objs[i]); | ||||||
| 
 | 
 | ||||||
| //	__spl_slab_reclaim(skc);
 |  | ||||||
| 	skm->skm_avail -= count; | 	skm->skm_avail -= count; | ||||||
| 	memmove(skm->skm_objs, &(skm->skm_objs[count]), | 	memmove(skm->skm_objs, &(skm->skm_objs[count]), | ||||||
| 	        sizeof(void *) * skm->skm_avail); | 	        sizeof(void *) * skm->skm_avail); | ||||||
| @ -1200,6 +1346,10 @@ spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) | |||||||
| 	RETURN(count); | 	RETURN(count); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Allocate an object from the per-cpu magazine, or if the magazine | ||||||
|  |  * is empty directly allocate from a slab and repopulate the magazine. | ||||||
|  |  */ | ||||||
| void * | void * | ||||||
| spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) | spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) | ||||||
| { | { | ||||||
| @ -1209,7 +1359,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) | |||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(skc->skc_magic == SKC_MAGIC); | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
| 	ASSERT(flags & KM_SLEEP); /* XXX: KM_NOSLEEP not yet supported */ | 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); | ||||||
|  | 	ASSERT(flags & KM_SLEEP); | ||||||
|  | 	atomic_inc(&skc->skc_ref); | ||||||
| 	local_irq_save(irq_flags); | 	local_irq_save(irq_flags); | ||||||
| 
 | 
 | ||||||
| restart: | restart: | ||||||
| @ -1225,8 +1377,7 @@ restart: | |||||||
| 	if (likely(skm->skm_avail)) { | 	if (likely(skm->skm_avail)) { | ||||||
| 		/* Object available in CPU cache, use it */ | 		/* Object available in CPU cache, use it */ | ||||||
| 		obj = skm->skm_objs[--skm->skm_avail]; | 		obj = skm->skm_objs[--skm->skm_avail]; | ||||||
| 		if (!(skc->skc_flags & KMC_NOTOUCH)) | 		skm->skm_age = jiffies; | ||||||
| 			skm->skm_age = jiffies; |  | ||||||
| 	} else { | 	} else { | ||||||
| 		/* Per-CPU cache empty, directly allocate from
 | 		/* Per-CPU cache empty, directly allocate from
 | ||||||
| 		 * the slab and refill the per-CPU cache. */ | 		 * the slab and refill the per-CPU cache. */ | ||||||
| @ -1240,11 +1391,18 @@ restart: | |||||||
| 
 | 
 | ||||||
| 	/* Pre-emptively migrate object to CPU L1 cache */ | 	/* Pre-emptively migrate object to CPU L1 cache */ | ||||||
| 	prefetchw(obj); | 	prefetchw(obj); | ||||||
|  | 	atomic_dec(&skc->skc_ref); | ||||||
| 
 | 
 | ||||||
| 	RETURN(obj); | 	RETURN(obj); | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(spl_kmem_cache_alloc); | EXPORT_SYMBOL(spl_kmem_cache_alloc); | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Free an object back to the local per-cpu magazine, there is no | ||||||
|  |  * guarantee that this is the same magazine the object was originally | ||||||
|  |  * allocated from.  We may need to flush entire from the magazine | ||||||
|  |  * back to the slabs to make space. | ||||||
|  |  */ | ||||||
| void | void | ||||||
| spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) | spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) | ||||||
| { | { | ||||||
| @ -1253,6 +1411,8 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) | |||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(skc->skc_magic == SKC_MAGIC); | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
|  | 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); | ||||||
|  | 	atomic_inc(&skc->skc_ref); | ||||||
| 	local_irq_save(flags); | 	local_irq_save(flags); | ||||||
| 
 | 
 | ||||||
| 	/* Safe to update per-cpu structure without lock, but
 | 	/* Safe to update per-cpu structure without lock, but
 | ||||||
| @ -1270,62 +1430,87 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) | |||||||
| 	skm->skm_objs[skm->skm_avail++] = obj; | 	skm->skm_objs[skm->skm_avail++] = obj; | ||||||
| 
 | 
 | ||||||
| 	local_irq_restore(flags); | 	local_irq_restore(flags); | ||||||
|  | 	atomic_dec(&skc->skc_ref); | ||||||
| 
 | 
 | ||||||
| 	EXIT; | 	EXIT; | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(spl_kmem_cache_free); | EXPORT_SYMBOL(spl_kmem_cache_free); | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * The generic shrinker function for all caches.  Under linux a shrinker | ||||||
|  |  * may not be tightly coupled with a slab cache.  In fact linux always | ||||||
|  |  * systematically trys calling all registered shrinker callbacks which | ||||||
|  |  * report that they contain unused objects.  Because of this we only | ||||||
|  |  * register one shrinker function in the shim layer for all slab caches. | ||||||
|  |  * We always attempt to shrink all caches when this generic shrinker | ||||||
|  |  * is called.  The shrinker should return the number of free objects | ||||||
|  |  * in the cache when called with nr_to_scan == 0 but not attempt to | ||||||
|  |  * free any objects.  When nr_to_scan > 0 it is a request that nr_to_scan | ||||||
|  |  * objects should be freed, because Solaris semantics are to free | ||||||
|  |  * all available objects we may free more objects than requested. | ||||||
|  |  */ | ||||||
| static int | static int | ||||||
| spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask) | spl_kmem_cache_generic_shrinker(int nr_to_scan, unsigned int gfp_mask) | ||||||
| { | { | ||||||
| 	spl_kmem_cache_t *skc; | 	spl_kmem_cache_t *skc; | ||||||
|  | 	int unused = 0; | ||||||
| 
 | 
 | ||||||
| 	/* Under linux a shrinker is not tightly coupled with a slab
 |  | ||||||
| 	 * cache.  In fact linux always systematically trys calling all |  | ||||||
| 	 * registered shrinker callbacks until its target reclamation level |  | ||||||
| 	 * is reached.  Because of this we only register one shrinker |  | ||||||
| 	 * function in the shim layer for all slab caches.  And we always |  | ||||||
| 	 * attempt to shrink all caches when this generic shrinker is called. |  | ||||||
| 	 */ |  | ||||||
| 	down_read(&spl_kmem_cache_sem); | 	down_read(&spl_kmem_cache_sem); | ||||||
|  | 	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { | ||||||
|  | 		if (nr_to_scan) | ||||||
|  | 			spl_kmem_cache_reap_now(skc); | ||||||
| 
 | 
 | ||||||
| 	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) | 		/*
 | ||||||
| 		spl_kmem_cache_reap_now(skc); | 		 * Presume everything alloc'ed in reclaimable, this ensures | ||||||
| 
 | 		 * we are called again with nr_to_scan > 0 so can try and | ||||||
|  | 		 * reclaim.  The exact number is not important either so | ||||||
|  | 		 * we forgo taking this already highly contented lock. | ||||||
|  | 		 */ | ||||||
|  | 		unused += skc->skc_obj_alloc; | ||||||
|  | 	} | ||||||
| 	up_read(&spl_kmem_cache_sem); | 	up_read(&spl_kmem_cache_sem); | ||||||
| 
 | 
 | ||||||
| 	/* XXX: Under linux we should return the remaining number of
 | 	return (unused * sysctl_vfs_cache_pressure) / 100; | ||||||
| 	 * entries in the cache.  We should do this as well. |  | ||||||
| 	 */ |  | ||||||
| 	return 1; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Call the registered reclaim function for a cache.  Depending on how | ||||||
|  |  * many and which objects are released it may simply repopulate the | ||||||
|  |  * local magazine which will then need to age-out.  Objects which cannot | ||||||
|  |  * fit in the magazine we will be released back to their slabs which will | ||||||
|  |  * also need to age out before being release.  This is all just best | ||||||
|  |  * effort and we do not want to thrash creating and destroying slabs. | ||||||
|  |  */ | ||||||
| void | void | ||||||
| spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) | spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) | ||||||
| { | { | ||||||
| 	spl_kmem_magazine_t *skm; |  | ||||||
| 	int i; |  | ||||||
| 	ENTRY; | 	ENTRY; | ||||||
| 
 | 
 | ||||||
| 	ASSERT(skc->skc_magic == SKC_MAGIC); | 	ASSERT(skc->skc_magic == SKC_MAGIC); | ||||||
|  | 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); | ||||||
|  | 
 | ||||||
|  | 	/* Prevent concurrent cache reaping when contended */ | ||||||
|  | 	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) { | ||||||
|  | 		EXIT; | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	atomic_inc(&skc->skc_ref); | ||||||
| 
 | 
 | ||||||
| 	if (skc->skc_reclaim) | 	if (skc->skc_reclaim) | ||||||
| 		skc->skc_reclaim(skc->skc_private); | 		skc->skc_reclaim(skc->skc_private); | ||||||
| 
 | 
 | ||||||
| 	/* Ensure per-CPU caches which are idle gradually flush */ | 	spl_slab_reclaim(skc, 0); | ||||||
| 	for_each_online_cpu(i) { | 	clear_bit(KMC_BIT_REAPING, &skc->skc_flags); | ||||||
| 		skm = skc->skc_mag[i]; | 	atomic_dec(&skc->skc_ref); | ||||||
| 
 |  | ||||||
| 		if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ)) |  | ||||||
| 			(void)spl_cache_flush(skc, skm, skm->skm_refill); |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	spl_slab_reclaim(skc); |  | ||||||
| 
 | 
 | ||||||
| 	EXIT; | 	EXIT; | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(spl_kmem_cache_reap_now); | EXPORT_SYMBOL(spl_kmem_cache_reap_now); | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Reap all free slabs from all registered caches. | ||||||
|  |  */ | ||||||
| void | void | ||||||
| spl_kmem_reap(void) | spl_kmem_reap(void) | ||||||
| { | { | ||||||
|  | |||||||
@@ -40,6 +40,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/list.h>
+#include <linux/swap.h>
 
 #include <asm/ioctls.h>
 #include <asm/uaccess.h>
|  | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							