Merge branch 'kmem-rework'

The core motivation behind these changes is to minimize the memory management differences between ZFS on Linux and other platforms. This simplifies the process of porting changes to Linux from other platforms. This is good for code quality and is expected to reduce the number of defects accidentally introduced due to porting. The key reason this is now possible is the addition of Linux features such as the thread-specific PF_FSTRANS bit, which was introduced for XFS. This patch stack also performs some refactoring and cleanup designed to make the code more maintainable and understandable. Finally, in the course of making and testing these changes, several bugs were identified and resolved, resulting in a more robust implementation. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <ryao@gentoo.org> Signed-off-by: Tim Chase <tim@chase2k.com> Closes #414
2024-11-17 18:11:00 +03:00 · 2015-01-16 13:59:18 -08:00 · 2015-01-16 13:59:18 -08:00 · 9099312977
commit 9099312977
parent 47af4b76ff ee33517452
24 changed files with 3058 additions and 2685 deletions
--- a/include/linux/proc_compat.h
+++ b/include/linux/proc_compat.h
@ -22,8 +22,8 @@
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 \*****************************************************************************/

-#ifndef _SPL_PROC_H
-#define _SPL_PROC_H
+#ifndef _SPL_PROC_COMPAT_H
+#define _SPL_PROC_COMPAT_H

 #include <linux/proc_fs.h>

@ -32,4 +32,4 @@ extern struct proc_dir_entry *proc_spl_kstat;
 int spl_proc_init(void);
 void spl_proc_fini(void);

-#endif /* SPL_PROC_H */
+#endif /* SPL_PROC_COMPAT_H */
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@ -44,6 +44,7 @@ KERNEL_H = \
 	$(top_srcdir)/include/sys/isa_defs.h \
 	$(top_srcdir)/include/sys/kidmap.h \
 	$(top_srcdir)/include/sys/kmem.h \
+	$(top_srcdir)/include/sys/kmem_cache.h \
 	$(top_srcdir)/include/sys/kobj.h \
 	$(top_srcdir)/include/sys/kstat.h \
 	$(top_srcdir)/include/sys/list.h \
@ -94,6 +95,7 @@ KERNEL_H = \
 	$(top_srcdir)/include/sys/varargs.h \
 	$(top_srcdir)/include/sys/vfs.h \
 	$(top_srcdir)/include/sys/vfs_opreg.h \
+	$(top_srcdir)/include/sys/vmem.h \
 	$(top_srcdir)/include/sys/vmsystm.h \
 	$(top_srcdir)/include/sys/vnode.h \
 	$(top_srcdir)/include/sys/zmod.h \
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@ -1,4 +1,4 @@
-/*****************************************************************************\
+/*
 *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 *  Copyright (C) 2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
@ -20,298 +20,14 @@
 *
 *  You should have received a copy of the GNU General Public License along
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
-\*****************************************************************************/
+ */

 #ifndef _SPL_KMEM_H
 #define	_SPL_KMEM_H

-#include <linux/module.h>
+#include <sys/debug.h>
 #include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/spinlock.h>
-#include <linux/rwsem.h>
-#include <linux/hash.h>
-#include <linux/rbtree.h>
-#include <linux/ctype.h>
-#include <asm/atomic.h>
-#include <sys/types.h>
-#include <sys/vmsystm.h>
-#include <sys/kstat.h>
-#include <sys/taskq.h>
-
-/*
- * Memory allocation interfaces
- */
-#define KM_SLEEP	GFP_KERNEL	/* Can sleep, never fails */
-#define KM_NOSLEEP	GFP_ATOMIC	/* Can not sleep, may fail */
-#define KM_PUSHPAGE	(GFP_NOIO | __GFP_HIGH)	/* Use reserved memory */
-#define KM_NODEBUG	__GFP_NOWARN	/* Suppress warnings */
-#define KM_FLAGS	__GFP_BITS_MASK
-#define KM_VMFLAGS	GFP_LEVEL_MASK
-
-/*
- * Used internally, the kernel does not need to support this flag
- */
-#ifndef __GFP_ZERO
-# define __GFP_ZERO                     0x8000
-#endif
-
-/*
- * PF_NOFS is a per-process debug flag which is set in current->flags to
- * detect when a process is performing an unsafe allocation.  All tasks
- * with PF_NOFS set must strictly use KM_PUSHPAGE for allocations because
- * if they enter direct reclaim and initiate I/O the may deadlock.
- *
- * When debugging is disabled, any incorrect usage will be detected and
- * a call stack with warning will be printed to the console.  The flags
- * will then be automatically corrected to allow for safe execution.  If
- * debugging is enabled this will be treated as a fatal condition.
- *
- * To avoid any risk of conflicting with the existing PF_ flags.  The
- * PF_NOFS bit shadows the rarely used PF_MUTEX_TESTER bit.  Only when
- * CONFIG_RT_MUTEX_TESTER is not set, and we know this bit is unused,
- * will the PF_NOFS bit be valid.  Happily, most existing distributions
- * ship a kernel with CONFIG_RT_MUTEX_TESTER disabled.
- */
-#if !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER)
-#define	PF_NOFS	PF_MUTEX_TESTER
-
-static inline void
-sanitize_flags(struct task_struct *p, gfp_t *flags)
-{
-	if (unlikely((p->flags & PF_NOFS) && (*flags & (__GFP_IO|__GFP_FS)))) {
-#ifdef NDEBUG
-		printk(KERN_WARNING "Fixing allocation for task %s (%d) "
-		    "which used GFP flags 0x%x with PF_NOFS set\n",
-		    p->comm, p->pid, *flags);
-		spl_dumpstack();
-		*flags &= ~(__GFP_IO|__GFP_FS);
-#else
-		PANIC("FATAL allocation for task %s (%d) which used GFP "
-		    "flags 0x%x with PF_NOFS set\n", p->comm, p->pid, *flags);
-#endif /* NDEBUG */
-	}
-}
-#else
-#define PF_NOFS			0x00000000
-#define sanitize_flags(p, fl)	((void)0)
-#endif /* !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) */
-
-/*
- * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
- * early as 2.6.32.  To avoid this issue when it occurs in upstream kernels
- * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
- * I would prefer the caller handle the failure case cleanly but we are
- * trying to emulate Solaris and those are not the Solaris semantics.
- */
-static inline void *
-kmalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	sanitize_flags(current, &flags);
-
-	do {
-		ptr = kmalloc(size, flags);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
-
-	return ptr;
-}
-
-static inline void *
-kzalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	sanitize_flags(current, &flags);
-
-	do {
-		ptr = kzalloc(size, flags);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
-
-	return ptr;
-}
-
-static inline void *
-kmalloc_node_nofail(size_t size, gfp_t flags, int node)
-{
-	void *ptr;
-
-	sanitize_flags(current, &flags);
-
-	do {
-		ptr = kmalloc_node(size, flags, node);
-	} while (ptr == NULL && (flags & __GFP_WAIT));
-
-	return ptr;
-}
-
-static inline void *
-vmalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	sanitize_flags(current, &flags);
-
-	/*
-	 * Retry failed __vmalloc() allocations once every second.  The
-	 * rational for the delay is that the likely failure modes are:
-	 *
-	 * 1) The system has completely exhausted memory, in which case
-	 *    delaying 1 second for the memory reclaim to run is reasonable
-	 *    to avoid thrashing the system.
-	 * 2) The system has memory but has exhausted the small virtual
-	 *    address space available on 32-bit systems.  Retrying the
-	 *    allocation immediately will only result in spinning on the
-	 *    virtual address space lock.  It is better delay a second and
-	 *    hope that another process will free some of the address space.
-	 *    But the bottom line is there is not much we can actually do
-	 *    since we can never safely return a failure and honor the
-	 *    Solaris semantics.
-	 */
-	while (1) {
-		ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
-		if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ);
-		} else {
-			break;
-		}
-	}
-
-	return ptr;
-}
-
-static inline void *
-vzalloc_nofail(size_t size, gfp_t flags)
-{
-	void *ptr;
-
-	ptr = vmalloc_nofail(size, flags);
-	if (ptr)
-		memset(ptr, 0, (size));
-
-	return ptr;
-}
-
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
-# ifdef HAVE_ATOMIC64_T
-
-# define kmem_alloc_used_add(size)      atomic64_add(size, &kmem_alloc_used)
-# define kmem_alloc_used_sub(size)      atomic64_sub(size, &kmem_alloc_used)
-# define kmem_alloc_used_read()         atomic64_read(&kmem_alloc_used)
-# define kmem_alloc_used_set(size)      atomic64_set(&kmem_alloc_used, size)
-# define vmem_alloc_used_add(size)      atomic64_add(size, &vmem_alloc_used)
-# define vmem_alloc_used_sub(size)      atomic64_sub(size, &vmem_alloc_used)
-# define vmem_alloc_used_read()         atomic64_read(&vmem_alloc_used)
-# define vmem_alloc_used_set(size)      atomic64_set(&vmem_alloc_used, size)
-
-extern atomic64_t kmem_alloc_used;
-extern unsigned long long kmem_alloc_max;
-extern atomic64_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-# else  /* HAVE_ATOMIC64_T */
-
-# define kmem_alloc_used_add(size)      atomic_add(size, &kmem_alloc_used)
-# define kmem_alloc_used_sub(size)      atomic_sub(size, &kmem_alloc_used)
-# define kmem_alloc_used_read()         atomic_read(&kmem_alloc_used)
-# define kmem_alloc_used_set(size)      atomic_set(&kmem_alloc_used, size)
-# define vmem_alloc_used_add(size)      atomic_add(size, &vmem_alloc_used)
-# define vmem_alloc_used_sub(size)      atomic_sub(size, &vmem_alloc_used)
-# define vmem_alloc_used_read()         atomic_read(&vmem_alloc_used)
-# define vmem_alloc_used_set(size)      atomic_set(&vmem_alloc_used, size)
-
-extern atomic_t kmem_alloc_used;
-extern unsigned long long kmem_alloc_max;
-extern atomic_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-# endif /* HAVE_ATOMIC64_T */
-
-# ifdef DEBUG_KMEM_TRACKING
-/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
- *
- * The maximum level of memory debugging.  All memory will be accounted
- * for and each allocation will be explicitly tracked.  Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported.  This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging.  This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#  define kmem_alloc(sz, fl)            kmem_alloc_track((sz), (fl),           \
-                                             __FUNCTION__, __LINE__, 0, 0)
-#  define kmem_zalloc(sz, fl)           kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
-                                             __FUNCTION__, __LINE__, 0, 0)
-#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_track((sz), (fl),           \
-                                             __FUNCTION__, __LINE__, 1, nd)
-#  define kmem_free(ptr, sz)            kmem_free_track((ptr), (sz))
-
-#  define vmem_alloc(sz, fl)            vmem_alloc_track((sz), (fl),           \
-                                             __FUNCTION__, __LINE__)
-#  define vmem_zalloc(sz, fl)           vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
-                                             __FUNCTION__, __LINE__)
-#  define vmem_free(ptr, sz)            vmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-extern void *vmem_alloc_track(size_t, int, const char *, int);
-extern void vmem_free_track(const void *, size_t);
-
-# else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * The default build will set DEBUG_KEM.  This provides basic memory
- * accounting with little to no impact on performance.  When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console.  To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#  define kmem_alloc(sz, fl)            kmem_alloc_debug((sz), (fl),           \
-                                             __FUNCTION__, __LINE__, 0, 0)
-#  define kmem_zalloc(sz, fl)           kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
-                                             __FUNCTION__, __LINE__, 0, 0)
-#  define kmem_alloc_node(sz, fl, nd)   kmem_alloc_debug((sz), (fl),           \
-                                             __FUNCTION__, __LINE__, 1, nd)
-#  define kmem_free(ptr, sz)            kmem_free_debug((ptr), (sz))
-
-#  define vmem_alloc(sz, fl)            vmem_alloc_debug((sz), (fl),           \
-                                             __FUNCTION__, __LINE__)
-#  define vmem_zalloc(sz, fl)           vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
-                                             __FUNCTION__, __LINE__)
-#  define vmem_free(ptr, sz)            vmem_free_debug((ptr), (sz))
-
-extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
-extern void kmem_free_debug(const void *, size_t);
-extern void *vmem_alloc_debug(size_t, int, const char *, int);
-extern void vmem_free_debug(const void *, size_t);
-
-# endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * All debugging is disabled.  There will be no overhead even for
- * minimal memory accounting.  To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
- */
-# define kmem_alloc(sz, fl)             kmalloc_nofail((sz), (fl))
-# define kmem_zalloc(sz, fl)            kzalloc_nofail((sz), (fl))
-# define kmem_alloc_node(sz, fl, nd)    kmalloc_node_nofail((sz), (fl), (nd))
-# define kmem_free(ptr, sz)             ((void)(sz), kfree(ptr))
-
-# define vmem_alloc(sz, fl)             vmalloc_nofail((sz), (fl))
-# define vmem_zalloc(sz, fl)            vzalloc_nofail((sz), (fl))
-# define vmem_free(ptr, sz)             ((void)(sz), vfree(ptr))
-
-#endif /* DEBUG_KMEM */
+#include <linux/sched.h>

 extern int kmem_debugging(void);
 extern char *kmem_vasprintf(const char *fmt, va_list ap);
@ -319,218 +35,116 @@ extern char *kmem_asprintf(const char *fmt, ...);
 extern char *strdup(const char *str);
 extern void strfree(char *str);

+/*
+ * Memory allocation interfaces
+ */
+#define	KM_SLEEP	0x0000	/* can block for memory; success guaranteed */
+#define	KM_NOSLEEP	0x0001	/* cannot block for memory; may fail */
+#define	KM_PUSHPAGE	0x0004	/* can block for memory; may use reserve */
+#define	KM_ZERO		0x1000	/* zero the allocation */
+#define	KM_VMEM		0x2000	/* caller is vmem_* wrapper */
+
+#define	KM_PUBLIC_MASK	(KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)

 /*
- * Slab allocation interfaces.  The SPL slab differs from the standard
- * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
- * allocated from the physical or virtal memory address space.  The virtual
- * slabs allow for good behavior when allocation large objects of identical
- * size.  This slab implementation also supports both constructors and
- * destructions which the Linux slab does not.
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart.  The conversion
+ * function is context aware which means that KM_SLEEP allocations can be
+ * safely used in syncing contexts which have set PF_FSTRANS.
 */
-enum {
-	KMC_BIT_NOTOUCH		= 0,	/* Don't update ages */
-	KMC_BIT_NODEBUG		= 1,	/* Default behavior */
-	KMC_BIT_NOMAGAZINE	= 2,	/* XXX: Unsupported */
-	KMC_BIT_NOHASH		= 3,	/* XXX: Unsupported */
-	KMC_BIT_QCACHE		= 4,	/* XXX: Unsupported */
-	KMC_BIT_KMEM		= 5,	/* Use kmem cache */
-	KMC_BIT_VMEM		= 6,	/* Use vmem cache */
-	KMC_BIT_SLAB		= 7,	/* Use Linux slab cache */
-	KMC_BIT_OFFSLAB		= 8,	/* Objects not on slab */
-	KMC_BIT_NOEMERGENCY	= 9,	/* Disable emergency objects */
-	KMC_BIT_DEADLOCKED      = 14,	/* Deadlock detected */
-	KMC_BIT_GROWING         = 15,   /* Growing in progress */
-	KMC_BIT_REAPING		= 16,	/* Reaping in progress */
-	KMC_BIT_DESTROY		= 17,	/* Destroy in progress */
-	KMC_BIT_TOTAL		= 18,	/* Proc handler helper bit */
-	KMC_BIT_ALLOC		= 19,	/* Proc handler helper bit */
-	KMC_BIT_MAX		= 20,	/* Proc handler helper bit */
-};
-
-/* kmem move callback return values */
-typedef enum kmem_cbrc {
-	KMEM_CBRC_YES		= 0,	/* Object moved */
-	KMEM_CBRC_NO		= 1,	/* Object not moved */
-	KMEM_CBRC_LATER		= 2,	/* Object not moved, try again later */
-	KMEM_CBRC_DONT_NEED	= 3,	/* Neither object is needed */
-	KMEM_CBRC_DONT_KNOW	= 4,	/* Object unknown */
-} kmem_cbrc_t;
-
-#define KMC_NOTOUCH		(1 << KMC_BIT_NOTOUCH)
-#define KMC_NODEBUG		(1 << KMC_BIT_NODEBUG)
-#define KMC_NOMAGAZINE		(1 << KMC_BIT_NOMAGAZINE)
-#define KMC_NOHASH		(1 << KMC_BIT_NOHASH)
-#define KMC_QCACHE		(1 << KMC_BIT_QCACHE)
-#define KMC_KMEM		(1 << KMC_BIT_KMEM)
-#define KMC_VMEM		(1 << KMC_BIT_VMEM)
-#define KMC_SLAB		(1 << KMC_BIT_SLAB)
-#define KMC_OFFSLAB		(1 << KMC_BIT_OFFSLAB)
-#define KMC_NOEMERGENCY		(1 << KMC_BIT_NOEMERGENCY)
-#define KMC_DEADLOCKED		(1 << KMC_BIT_DEADLOCKED)
-#define KMC_GROWING		(1 << KMC_BIT_GROWING)
-#define KMC_REAPING		(1 << KMC_BIT_REAPING)
-#define KMC_DESTROY		(1 << KMC_BIT_DESTROY)
-#define KMC_TOTAL		(1 << KMC_BIT_TOTAL)
-#define KMC_ALLOC		(1 << KMC_BIT_ALLOC)
-#define KMC_MAX			(1 << KMC_BIT_MAX)
-
-#define KMC_REAP_CHUNK		INT_MAX
-#define KMC_DEFAULT_SEEKS	1
-
-#define KMC_EXPIRE_AGE		0x1     /* Due to age */
-#define KMC_EXPIRE_MEM		0x2     /* Due to low memory */
-
-#define	KMC_RECLAIM_ONCE	0x1	/* Force a single shrinker pass */
-
-extern unsigned int spl_kmem_cache_expire;
-extern struct list_head spl_kmem_cache_list;
-extern struct rw_semaphore spl_kmem_cache_sem;
-
-#define SKM_MAGIC			0x2e2e2e2e
-#define SKO_MAGIC			0x20202020
-#define SKS_MAGIC			0x22222222
-#define SKC_MAGIC			0x2c2c2c2c
-
-#define SPL_KMEM_CACHE_DELAY		15	/* Minimum slab release age */
-#define SPL_KMEM_CACHE_REAP		0	/* Default reap everything */
-#define SPL_KMEM_CACHE_OBJ_PER_SLAB	16	/* Target objects per slab */
-#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN	1	/* Minimum objects per slab */
-#define SPL_KMEM_CACHE_ALIGN		8	/* Default object alignment */
-
-#define POINTER_IS_VALID(p)		0	/* Unimplemented */
-#define POINTER_INVALIDATE(pp)			/* Unimplemented */
-
-typedef int (*spl_kmem_ctor_t)(void *, void *, int);
-typedef void (*spl_kmem_dtor_t)(void *, void *);
-typedef void (*spl_kmem_reclaim_t)(void *);
-
-typedef struct spl_kmem_magazine {
-	uint32_t		skm_magic;	/* Sanity magic */
-	uint32_t		skm_avail;	/* Available objects */
-	uint32_t		skm_size;	/* Magazine size */
-	uint32_t		skm_refill;	/* Batch refill size */
-	struct spl_kmem_cache	*skm_cache;	/* Owned by cache */
-	unsigned long		skm_age;	/* Last cache access */
-	unsigned int		skm_cpu;	/* Owned by cpu */
-	void			*skm_objs[0];	/* Object pointers */
-} spl_kmem_magazine_t;
-
-typedef struct spl_kmem_obj {
-        uint32_t		sko_magic;	/* Sanity magic */
-	void			*sko_addr;	/* Buffer address */
-	struct spl_kmem_slab	*sko_slab;	/* Owned by slab */
-	struct list_head	sko_list;	/* Free object list linkage */
-} spl_kmem_obj_t;
-
-typedef struct spl_kmem_slab {
-        uint32_t		sks_magic;	/* Sanity magic */
-	uint32_t		sks_objs;	/* Objects per slab */
-	struct spl_kmem_cache	*sks_cache;	/* Owned by cache */
-	struct list_head	sks_list;	/* Slab list linkage */
-	struct list_head	sks_free_list;	/* Free object list */
-	unsigned long		sks_age;	/* Last modify jiffie */
-	uint32_t		sks_ref;	/* Ref count used objects */
-} spl_kmem_slab_t;
-
-typedef struct spl_kmem_alloc {
-	struct spl_kmem_cache	*ska_cache;	/* Owned by cache */
-	int			ska_flags;	/* Allocation flags */
-	taskq_ent_t		ska_tqe;	/* Task queue entry */
-} spl_kmem_alloc_t;
-
-typedef struct spl_kmem_emergency {
-	struct rb_node		ske_node;	/* Emergency tree linkage */
-	void			*ske_obj;	/* Buffer address */
-} spl_kmem_emergency_t;
-
-typedef struct spl_kmem_cache {
-	uint32_t		skc_magic;	/* Sanity magic */
-	uint32_t		skc_name_size;	/* Name length */
-	char			*skc_name;	/* Name string */
-	spl_kmem_magazine_t	*skc_mag[NR_CPUS]; /* Per-CPU warm cache */
-	uint32_t		skc_mag_size;	/* Magazine size */
-	uint32_t		skc_mag_refill;	/* Magazine refill count */
-	spl_kmem_ctor_t		skc_ctor;	/* Constructor */
-	spl_kmem_dtor_t		skc_dtor;	/* Destructor */
-	spl_kmem_reclaim_t	skc_reclaim;	/* Reclaimator */
-	void			*skc_private;	/* Private data */
-	void			*skc_vmp;	/* Unused */
-	struct kmem_cache	*skc_linux_cache; /* Linux slab cache if used */
-	unsigned long		skc_flags;	/* Flags */
-	uint32_t		skc_obj_size;	/* Object size */
-	uint32_t		skc_obj_align;	/* Object alignment */
-	uint32_t		skc_slab_objs;	/* Objects per slab */
-	uint32_t		skc_slab_size;	/* Slab size */
-	uint32_t		skc_delay;	/* Slab reclaim interval */
-	uint32_t		skc_reap;	/* Slab reclaim count */
-	atomic_t		skc_ref;	/* Ref count callers */
-	taskqid_t		skc_taskqid;	/* Slab reclaim task */
-	struct list_head	skc_list;	/* List of caches linkage */
-	struct list_head	skc_complete_list;/* Completely alloc'ed */
-	struct list_head	skc_partial_list; /* Partially alloc'ed */
-	struct rb_root		skc_emergency_tree; /* Min sized objects */
-	spinlock_t		skc_lock;	/* Cache lock */
-	wait_queue_head_t	skc_waitq;	/* Allocation waiters */
-	uint64_t		skc_slab_fail;	/* Slab alloc failures */
-	uint64_t		skc_slab_create;/* Slab creates */
-	uint64_t		skc_slab_destroy;/* Slab destroys */
-	uint64_t		skc_slab_total;	/* Slab total current */
-	uint64_t		skc_slab_alloc;	/* Slab alloc current */
-	uint64_t		skc_slab_max;	/* Slab max historic  */
-	uint64_t		skc_obj_total;	/* Obj total current */
-	uint64_t		skc_obj_alloc;	/* Obj alloc current */
-	uint64_t		skc_obj_max;	/* Obj max historic */
-	uint64_t		skc_obj_deadlock;  /* Obj emergency deadlocks */
-	uint64_t		skc_obj_emergency; /* Obj emergency current */
-	uint64_t		skc_obj_emergency_max; /* Obj emergency max */
-} spl_kmem_cache_t;
-#define kmem_cache_t		spl_kmem_cache_t
-
-extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size,
-	size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor,
-	spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags);
-extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
-	kmem_cbrc_t (*)(void *, void *, size_t, void *));
-extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
-extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
-extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
-extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
-extern void spl_kmem_reap(void);
-
-int spl_kmem_init(void);
-void spl_kmem_fini(void);
-
-#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \
-        spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags)
-#define kmem_cache_set_move(skc, move)	spl_kmem_cache_set_move(skc, move)
-#define kmem_cache_destroy(skc)		spl_kmem_cache_destroy(skc)
-#define kmem_cache_alloc(skc, flags)	spl_kmem_cache_alloc(skc, flags)
-#define kmem_cache_free(skc, obj)	spl_kmem_cache_free(skc, obj)
-#define kmem_cache_reap_now(skc)	\
-        spl_kmem_cache_reap_now(skc, skc->skc_reap)
-#define kmem_reap()			spl_kmem_reap()
-#define kmem_virt(ptr)			(((ptr) >= (void *)VMALLOC_START) && \
-					 ((ptr) <  (void *)VMALLOC_END))
-
-/*
- * Allow custom slab allocation flags to be set for KMC_SLAB based caches.
- * One use for this function is to ensure the __GFP_COMP flag is part of
- * the default allocation mask which ensures higher order allocations are
- * properly refcounted.  This flag was added to the default ->allocflags
- * as of Linux 3.11.
- */
-static inline void
-kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
+static inline gfp_t
+kmem_flags_convert(int flags)
 {
-	if (skc->skc_linux_cache == NULL)
-		return;
+	gfp_t lflags = __GFP_NOWARN | __GFP_COMP;

-#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
-	skc->skc_linux_cache->allocflags |= flags;
-#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
-	skc->skc_linux_cache->gfpflags |= flags;
-#endif
+	if (flags & KM_NOSLEEP) {
+		lflags |= GFP_ATOMIC | __GFP_NORETRY;
+	} else {
+		lflags |= GFP_KERNEL;
+		if ((current->flags & PF_FSTRANS))
+			lflags &= ~(__GFP_IO|__GFP_FS);
+	}
+
+	if (flags & KM_PUSHPAGE)
+		lflags |= __GFP_HIGH;
+
+	if (flags & KM_ZERO)
+		lflags |= __GFP_ZERO;
+
+	return (lflags);
 }

+typedef struct {
+	struct task_struct *fstrans_thread;
+	unsigned int saved_flags;
+} fstrans_cookie_t;
+
+static inline fstrans_cookie_t
+spl_fstrans_mark(void)
+{
+	fstrans_cookie_t cookie;
+
+	cookie.fstrans_thread = current;
+	cookie.saved_flags = current->flags & PF_FSTRANS;
+	current->flags |= PF_FSTRANS;
+
+	return (cookie);
+}
+
+static inline void
+spl_fstrans_unmark(fstrans_cookie_t cookie)
+{
+	ASSERT3P(cookie.fstrans_thread, ==, current);
+	ASSERT(current->flags & PF_FSTRANS);
+
+	current->flags &= ~(PF_FSTRANS);
+	current->flags |= cookie.saved_flags;
+}
+
+static inline int
+spl_fstrans_check(void)
+{
+	return (current->flags & PF_FSTRANS);
+}
+
+#ifdef HAVE_ATOMIC64_T
+#define	kmem_alloc_used_add(size)	atomic64_add(size, &kmem_alloc_used)
+#define	kmem_alloc_used_sub(size)	atomic64_sub(size, &kmem_alloc_used)
+#define	kmem_alloc_used_read()		atomic64_read(&kmem_alloc_used)
+#define	kmem_alloc_used_set(size)	atomic64_set(&kmem_alloc_used, size)
+extern atomic64_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+#else  /* HAVE_ATOMIC64_T */
+#define	kmem_alloc_used_add(size)	atomic_add(size, &kmem_alloc_used)
+#define	kmem_alloc_used_sub(size)	atomic_sub(size, &kmem_alloc_used)
+#define	kmem_alloc_used_read()		atomic_read(&kmem_alloc_used)
+#define	kmem_alloc_used_set(size)	atomic_set(&kmem_alloc_used, size)
+extern atomic_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+#endif /* HAVE_ATOMIC64_T */
+
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
+
+#define	kmem_alloc(sz, fl)	spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define	kmem_zalloc(sz, fl)	spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define	kmem_free(ptr, sz)	spl_kmem_free((ptr), (sz))
+
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
+
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+    const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
+
 #endif	/* _SPL_KMEM_H */
--- a/include/sys/kmem_cache.h
+++ b/include/sys/kmem_cache.h
@ -0,0 +1,240 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KMEM_CACHE_H
+#define	_SPL_KMEM_CACHE_H
+
+#include <sys/taskq.h>
+
+/*
+ * Slab allocation interfaces.  The SPL slab differs from the standard
+ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
+ * allocated from the physical or virtual memory address space.  The virtual
+ * slabs allow for good behavior when allocating large objects of identical
+ * size.  This slab implementation also supports both constructors and
+ * destructors which the Linux slab does not.
+ */
+enum {
+	KMC_BIT_NOTOUCH		= 0,	/* Don't update ages */
+	KMC_BIT_NODEBUG		= 1,	/* Default behavior */
+	KMC_BIT_NOMAGAZINE	= 2,	/* XXX: Unsupported */
+	KMC_BIT_NOHASH		= 3,	/* XXX: Unsupported */
+	KMC_BIT_QCACHE		= 4,	/* XXX: Unsupported */
+	KMC_BIT_KMEM		= 5,	/* Use kmem cache */
+	KMC_BIT_VMEM		= 6,	/* Use vmem cache */
+	KMC_BIT_SLAB		= 7,	/* Use Linux slab cache */
+	KMC_BIT_OFFSLAB		= 8,	/* Objects not on slab */
+	KMC_BIT_NOEMERGENCY	= 9,	/* Disable emergency objects */
+	KMC_BIT_DEADLOCKED	= 14,	/* Deadlock detected */
+	KMC_BIT_GROWING		= 15,	/* Growing in progress */
+	KMC_BIT_REAPING		= 16,	/* Reaping in progress */
+	KMC_BIT_DESTROY		= 17,	/* Destroy in progress */
+	KMC_BIT_TOTAL		= 18,	/* Proc handler helper bit */
+	KMC_BIT_ALLOC		= 19,	/* Proc handler helper bit */
+	KMC_BIT_MAX		= 20,	/* Proc handler helper bit */
+};
+
+/* kmem move callback return values */
+typedef enum kmem_cbrc {
+	KMEM_CBRC_YES		= 0,	/* Object moved */
+	KMEM_CBRC_NO		= 1,	/* Object not moved */
+	KMEM_CBRC_LATER		= 2,	/* Object not moved, try again later */
+	KMEM_CBRC_DONT_NEED	= 3,	/* Neither object is needed */
+	KMEM_CBRC_DONT_KNOW	= 4,	/* Object unknown */
+} kmem_cbrc_t;
+
+#define	KMC_NOTOUCH		(1 << KMC_BIT_NOTOUCH)
+#define	KMC_NODEBUG		(1 << KMC_BIT_NODEBUG)
+#define	KMC_NOMAGAZINE		(1 << KMC_BIT_NOMAGAZINE)
+#define	KMC_NOHASH		(1 << KMC_BIT_NOHASH)
+#define	KMC_QCACHE		(1 << KMC_BIT_QCACHE)
+#define	KMC_KMEM		(1 << KMC_BIT_KMEM)
+#define	KMC_VMEM		(1 << KMC_BIT_VMEM)
+#define	KMC_SLAB		(1 << KMC_BIT_SLAB)
+#define	KMC_OFFSLAB		(1 << KMC_BIT_OFFSLAB)
+#define	KMC_NOEMERGENCY		(1 << KMC_BIT_NOEMERGENCY)
+#define	KMC_DEADLOCKED		(1 << KMC_BIT_DEADLOCKED)
+#define	KMC_GROWING		(1 << KMC_BIT_GROWING)
+#define	KMC_REAPING		(1 << KMC_BIT_REAPING)
+#define	KMC_DESTROY		(1 << KMC_BIT_DESTROY)
+#define	KMC_TOTAL		(1 << KMC_BIT_TOTAL)
+#define	KMC_ALLOC		(1 << KMC_BIT_ALLOC)
+#define	KMC_MAX			(1 << KMC_BIT_MAX)
+
+#define	KMC_REAP_CHUNK		INT_MAX
+#define	KMC_DEFAULT_SEEKS	1
+
+#define	KMC_EXPIRE_AGE		0x1	/* Due to age */
+#define	KMC_EXPIRE_MEM		0x2	/* Due to low memory */
+
+#define	KMC_RECLAIM_ONCE	0x1	/* Force a single shrinker pass */
+
+extern unsigned int spl_kmem_cache_expire;
+extern struct list_head spl_kmem_cache_list;
+extern struct rw_semaphore spl_kmem_cache_sem;
+
+#define	SKM_MAGIC			0x2e2e2e2e
+#define	SKO_MAGIC			0x20202020
+#define	SKS_MAGIC			0x22222222
+#define	SKC_MAGIC			0x2c2c2c2c
+
+#define	SPL_KMEM_CACHE_DELAY		15	/* Minimum slab release age */
+#define	SPL_KMEM_CACHE_REAP		0	/* Default reap everything */
+#define	SPL_KMEM_CACHE_OBJ_PER_SLAB	8	/* Target objects per slab */
+#define	SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN	1	/* Minimum objects per slab */
+#define	SPL_KMEM_CACHE_ALIGN		8	/* Default object alignment */
+#ifdef _LP64
+#define	SPL_KMEM_CACHE_MAX_SIZE		32	/* Max slab size in MB */
+#else
+#define	SPL_KMEM_CACHE_MAX_SIZE		4	/* Max slab size in MB */
+#endif
+
+#define	SPL_MAX_ORDER			(MAX_ORDER - 3)
+#define	SPL_MAX_ORDER_NR_PAGES		(1 << (SPL_MAX_ORDER - 1))
+
+#ifdef CONFIG_SLUB
+#define	SPL_MAX_KMEM_CACHE_ORDER	PAGE_ALLOC_COSTLY_ORDER
+#define	SPL_MAX_KMEM_ORDER_NR_PAGES	(1 << (SPL_MAX_KMEM_CACHE_ORDER - 1))
+#else
+#define	SPL_MAX_KMEM_ORDER_NR_PAGES	(KMALLOC_MAX_SIZE >> PAGE_SHIFT)
+#endif
+
+#define	POINTER_IS_VALID(p)		0	/* Unimplemented */
+#define	POINTER_INVALIDATE(pp)			/* Unimplemented */
+
+typedef int (*spl_kmem_ctor_t)(void *, void *, int);
+typedef void (*spl_kmem_dtor_t)(void *, void *);
+typedef void (*spl_kmem_reclaim_t)(void *);
+
+/*
+ * Magazine of cached object pointers, owned by a cache (skm_cache) and
+ * a cpu (skm_cpu).  skm_objs is a trailing flexible array member, so the
+ * pointer storage is expected to be allocated together with this header
+ * (presumably skm_size entries -- confirm against the allocator).
+ */
+typedef struct spl_kmem_magazine {
+	uint32_t		skm_magic;	/* Sanity magic */
+	uint32_t		skm_avail;	/* Available objects */
+	uint32_t		skm_size;	/* Magazine size */
+	uint32_t		skm_refill;	/* Batch refill size */
+	struct spl_kmem_cache	*skm_cache;	/* Owned by cache */
+	unsigned long		skm_age;	/* Last cache access */
+	unsigned int		skm_cpu;	/* Owned by cpu */
+	void			*skm_objs[];	/* Object pointers */
+} spl_kmem_magazine_t;
+
+/*
+ * Per-object record tying a buffer address to the slab that owns it and
+ * providing the linkage used by the free object list.
+ */
+typedef struct spl_kmem_obj {
+	uint32_t		sko_magic;	/* Sanity magic */
+	void			*sko_addr;	/* Buffer address */
+	struct spl_kmem_slab	*sko_slab;	/* Owned by slab */
+	struct list_head	sko_list;	/* Free object list linkage */
+} spl_kmem_obj_t;
+
+/*
+ * Slab descriptor: records the owning cache, the number of objects in the
+ * slab, the list of free objects, a reference count of objects in use and
+ * the jiffie of the last modification.
+ */
+typedef struct spl_kmem_slab {
+	uint32_t		sks_magic;	/* Sanity magic */
+	uint32_t		sks_objs;	/* Objects per slab */
+	struct spl_kmem_cache	*sks_cache;	/* Owned by cache */
+	struct list_head	sks_list;	/* Slab list linkage */
+	struct list_head	sks_free_list;	/* Free object list */
+	unsigned long		sks_age;	/* Last modify jiffie */
+	uint32_t		sks_ref;	/* Ref count used objects */
+} spl_kmem_slab_t;
+
+/*
+ * Allocation request: the target cache, the allocation flags and a task
+ * queue entry.  NOTE(review): presumably dispatched to a taskq through
+ * ska_tqe -- confirm against the users in spl-kmem-cache.c.
+ */
+typedef struct spl_kmem_alloc {
+	struct spl_kmem_cache	*ska_cache;	/* Owned by cache */
+	int			ska_flags;	/* Allocation flags */
+	taskq_ent_t		ska_tqe;	/* Task queue entry */
+} spl_kmem_alloc_t;
+
+/*
+ * Node of a cache's emergency tree (see skc_emergency_tree in
+ * spl_kmem_cache_t); each node records one buffer address.
+ */
+typedef struct spl_kmem_emergency {
+	struct rb_node		ske_node;	/* Emergency tree linkage */
+	unsigned long		ske_obj;	/* Buffer address */
+} spl_kmem_emergency_t;
+
+/*
+ * Cache descriptor.  Groups the cache identity (name, magic), the per-CPU
+ * magazines, the constructor/destructor/reclaim callbacks, the object and
+ * slab geometry, the slab lists and emergency tree, the lock and wait
+ * queue, and the slab/object statistics counters.
+ */
+typedef struct spl_kmem_cache {
+	uint32_t		skc_magic;	/* Sanity magic */
+	uint32_t		skc_name_size;	/* Name length */
+	char			*skc_name;	/* Name string */
+	spl_kmem_magazine_t	*skc_mag[NR_CPUS]; /* Per-CPU warm cache */
+	uint32_t		skc_mag_size;	/* Magazine size */
+	uint32_t		skc_mag_refill;	/* Magazine refill count */
+	spl_kmem_ctor_t		skc_ctor;	/* Constructor */
+	spl_kmem_dtor_t		skc_dtor;	/* Destructor */
+	spl_kmem_reclaim_t	skc_reclaim;	/* Reclaimator */
+	void			*skc_private;	/* Private data */
+	void			*skc_vmp;	/* Unused */
+	struct kmem_cache	*skc_linux_cache; /* Linux slab cache if used */
+	unsigned long		skc_flags;	/* Flags */
+	uint32_t		skc_obj_size;	/* Object size */
+	uint32_t		skc_obj_align;	/* Object alignment */
+	uint32_t		skc_slab_objs;	/* Objects per slab */
+	uint32_t		skc_slab_size;	/* Slab size */
+	uint32_t		skc_delay;	/* Slab reclaim interval */
+	uint32_t		skc_reap;	/* Slab reclaim count */
+	atomic_t		skc_ref;	/* Ref count callers */
+	taskqid_t		skc_taskqid;	/* Slab reclaim task */
+	struct list_head	skc_list;	/* List of caches linkage */
+	struct list_head	skc_complete_list; /* Completely alloc'ed */
+	struct list_head	skc_partial_list;  /* Partially alloc'ed */
+	struct rb_root		skc_emergency_tree; /* Min sized objects */
+	spinlock_t		skc_lock;	/* Cache lock */
+	wait_queue_head_t	skc_waitq;	/* Allocation waiters */
+	uint64_t		skc_slab_fail;	/* Slab alloc failures */
+	uint64_t		skc_slab_create;  /* Slab creates */
+	uint64_t		skc_slab_destroy; /* Slab destroys */
+	uint64_t		skc_slab_total;	/* Slab total current */
+	uint64_t		skc_slab_alloc;	/* Slab alloc current */
+	uint64_t		skc_slab_max;	/* Slab max historic  */
+	uint64_t		skc_obj_total;	/* Obj total current */
+	uint64_t		skc_obj_alloc;	/* Obj alloc current */
+	uint64_t		skc_obj_max;	/* Obj max historic */
+	uint64_t		skc_obj_deadlock;  /* Obj emergency deadlocks */
+	uint64_t		skc_obj_emergency; /* Obj emergency current */
+	uint64_t		skc_obj_emergency_max; /* Obj emergency max */
+} spl_kmem_cache_t;
+#define	kmem_cache_t		spl_kmem_cache_t
+
+extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size,
+    size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor,
+    spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags);
+extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
+    kmem_cbrc_t (*)(void *, void *, size_t, void *));
+extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
+extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
+extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
+extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
+extern void spl_kmem_reap(void);
+
+#define	kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \
+    spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
+#define	kmem_cache_set_move(skc, move)	spl_kmem_cache_set_move(skc, move)
+#define	kmem_cache_destroy(skc)		spl_kmem_cache_destroy(skc)
+#define	kmem_cache_alloc(skc, flags)	spl_kmem_cache_alloc(skc, flags)
+#define	kmem_cache_free(skc, obj)	spl_kmem_cache_free(skc, obj)
+/*
+ * Reap the cache using its own skc_reap count.  The argument is
+ * parenthesized so that an expression argument binds correctly; it is
+ * expanded twice, so it must not have side effects.
+ */
+#define	kmem_cache_reap_now(skc)	\
+    spl_kmem_cache_reap_now((skc), (skc)->skc_reap)
+#define	kmem_reap()			spl_kmem_reap()
+
+/*
+ * The following functions are only available for internal use.
+ */
+extern int spl_kmem_cache_init(void);
+extern void spl_kmem_cache_fini(void);
+
+#endif	/* _SPL_KMEM_CACHE_H */
--- a/include/sys/types.h
+++ b/include/sys/types.h
@ -48,7 +48,6 @@ typedef long long			longlong_t;
 typedef long long			offset_t;
 typedef struct task_struct		kthread_t;
 typedef struct task_struct		proc_t;
-typedef struct vmem { }			vmem_t;
 typedef short				pri_t;
 typedef struct timespec			timestruc_t; /* definition per SVr4 */
 typedef struct timespec			timespec_t;
--- a/include/sys/vmem.h
+++ b/include/sys/vmem.h
@ -0,0 +1,109 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VMEM_H
+#define	_SPL_VMEM_H
+
+#include <sys/kmem.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+typedef struct vmem { } vmem_t;
+
+extern vmem_t *heap_arena;
+extern vmem_t *zio_alloc_arena;
+extern vmem_t *zio_arena;
+
+extern size_t vmem_size(vmem_t *vmp, int typemask);
+extern void *spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot);
+
+/*
+ * Memory allocation interfaces
+ */
+#define	VMEM_ALLOC	0x01
+#define	VMEM_FREE	0x02
+
+#ifndef VMALLOC_TOTAL
+#define	VMALLOC_TOTAL	(VMALLOC_END - VMALLOC_START)
+#endif
+
+/*
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space. The kmem SLAB
+ * allocator allocates slabs from it. Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
+ *
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator. The
+ * buddy allocator is not available to kernel modules, it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
+ *
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc().  It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator. However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ *    Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ *    vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ *    only a few 100MB are available. The kernel will handle it by spinning
+ *    when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ *    lock which serializes all allocations.
+ *
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ *    list. The former will sum the allocations while the latter will print
+ *    them to user space in a way that user space can keep the lock held
+ *    indefinitely.  When the total number of mapped allocations is large
+ *    (several 100,000) a large amount of time will be spent waiting on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ *    memory is used, it simply does not work on virtual memory.  Certain
+ *    Linux structures (e.g. the superblock) use them and might be embedded
+ *    into a structure from Illumos.  This makes using Linux virtual memory
+ *    unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
+ */
+
+#define	vmem_alloc(sz, fl)	spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define	vmem_zalloc(sz, fl)	spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define	vmem_free(ptr, sz)	spl_vmem_free((ptr), (sz))
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
+
+int spl_vmem_init(void);
+void spl_vmem_fini(void);
+
+#endif	/* _SPL_VMEM_H */
--- a/include/sys/vmsystm.h
+++ b/include/sys/vmsystm.h
@ -37,19 +37,6 @@
 #define	physmem				totalram_pages
 #define	freemem				nr_free_pages()

-extern vmem_t *heap_arena;		/* primary kernel heap arena */
-extern vmem_t *zio_alloc_arena;		/* arena for zio caches */
-extern vmem_t *zio_arena;		/* arena for allocating zio memory */
-
-extern size_t vmem_size(vmem_t *vmp, int typemask);
-
-#define	VMEM_ALLOC	0x01
-#define	VMEM_FREE	0x02
-
-#ifndef VMALLOC_TOTAL
-#define	VMALLOC_TOTAL	(VMALLOC_END - VMALLOC_START)
-#endif
-
 #define	xcopyin(from, to, size)		copy_from_user(to, from, size)
 #define	xcopyout(from, to, size)	copy_to_user(to, from, size)

--- a/man/man5/spl-module-parameters.5
+++ b/man/man5/spl-module-parameters.5
@ -14,70 +14,200 @@ Description of the different parameters to the SPL module.
 .sp
 .LP

-.sp
-.ne 2
-.na
-\fBspl_debug_subsys\fR (ulong)
-.ad
-.RS 12n
-Subsystem debugging level mask.
-.sp
-Default value: \fB~0\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBspl_debug_mask\fR (ulong)
-.ad
-.RS 12n
-Debugging level mask.
-.sp
-Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE).
-.RE
-
-.sp
-.ne 2
-.na
-\fBspl_debug_printk\fR (ulong)
-.ad
-.RS 12n
-Console printk level mask.
-.sp
-Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE).
-.RE
-
-.sp
-.ne 2
-.na
-\fBspl_debug_mb\fR (int)
-.ad
-.RS 12n
-Total debug buffer size.
-.sp
-Default value: \fB-1\fR.
-.RE
-
-.sp
-.ne 2
-.na
-\fBspl_debug_panic_on_bug\fR (int)
-.ad
-.RS 12n
-Panic on BUG
-.sp
-Use \fB1\fR for yes and \fB0\fR for no (default).
-.RE
-
 .sp
 .ne 2
 .na
 \fBspl_kmem_cache_expire\fR (uint)
 .ad
 .RS 12n
-By age (0x1) or low memory (0x2)
+Cache expiration is part of default Illumos cache behavior.  The idea is
+that objects in magazines which have not been recently accessed should be
+returned to the slabs periodically.  This is known as cache aging and
+when enabled objects will be typically returned after 15 seconds.
 .sp
-Default value: \fB0\fR.
+On the other hand Linux slabs are designed to never move objects back to
+the slabs unless there is memory pressure.  This is possible because under
+Linux the cache will be notified when memory is low and objects can be
+released.
+.sp
+By default only the Linux method is enabled.  It has been shown to improve
+responsiveness on low memory systems and not negatively impact the performance
+of systems with more memory.  This policy may be changed by setting the
+\fBspl_kmem_cache_expire\fR bit mask as follows, both policies may be enabled
+concurrently.
+.sp
+0x01 - Aging (Illumos), 0x02 - Low memory (Linux)
+.sp
+Default value: \fB0x02\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_reclaim\fR (uint)
+.ad
+.RS 12n
+When this is set it prevents Linux from being able to rapidly reclaim all the
+memory held by the kmem caches.  This may be useful in circumstances where
+it's preferable that Linux reclaim memory from some other subsystem first.
+Setting this will increase the likelihood of out of memory events on a memory
+constrained system.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab\fR (uint)
+.ad
+.RS 12n
+The preferred number of objects per slab in the cache.   In general, a larger
+value will increase the cache's memory footprint while decreasing the time
+required to perform an allocation.  Conversely, a smaller value will minimize
+the footprint and improve cache reclaim time but individual allocations may
+take longer.
+.sp
+Default value: \fB8\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab_min\fR (uint)
+.ad
+.RS 12n
+The minimum number of objects allowed per slab.  Normally slabs will contain
+\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very
+large objects it's desirable to only have a few, or even just one, object per
+slab.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_max_size\fR (uint)
+.ad
+.RS 12n
+The maximum size of a kmem cache slab in MiB.  This effectively limits
+the maximum cache object size to \fBspl_kmem_cache_max_size\fR /
+\fBspl_kmem_cache_obj_per_slab\fR.  Caches may not be created with
+object sizes larger than this limit.
+.sp
+Default value: \fB32 (64-bit) or 4 (32-bit)\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_slab_limit\fR (uint)
+.ad
+.RS 12n
+For small objects the Linux slab allocator should be used to make the most
+efficient use of the memory.  However, large objects are not supported by
+the Linux slab and therefore the SPL implementation is preferred.  This
+value is used to determine the cutoff between a small and large object.
+.sp
+Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated
+using the Linux slab allocator, large objects use the SPL allocator.  A
+cutoff of 16K was determined to be optimal for architectures using 4K pages.
+.sp
+Default value: \fB16,384\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_kmem_limit\fR (uint)
+.ad
+.RS 12n
+Depending on the size of a cache object it may be backed by kmalloc()'d
+or vmalloc()'d memory.  This is because the size of the required allocation
+greatly impacts the best way to allocate the memory.
+.sp
+When objects are small and only a small number of memory pages need to be
+allocated, ideally just one, then kmalloc() is very efficient.  However,
+when allocating multiple pages with kmalloc() it gets increasingly expensive
+because the pages must be physically contiguous.
+.sp
+For this reason we shift to vmalloc() for slabs of large objects which
+removes the need for contiguous pages.  We cannot use vmalloc() in
+all cases because there is significant locking overhead involved.  This
+function takes a single global lock over the entire virtual address range
+which serializes all allocations.  Using slightly different allocation
+functions for small and large objects allows us to handle a wide range of
+object sizes.
+.sp
+The \fBspl_kmem_cache_kmem_limit\fR value is used to determine this cutoff
+size.  One quarter the PAGE_SIZE is used as the default value because
+\fBspl_kmem_cache_obj_per_slab\fR defaults to 8.  This means that at
+most we will need to allocate four contiguous pages.
+.sp
+Default value: \fBPAGE_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_warn\fR (uint)
+.ad
+.RS 12n
+As a general rule kmem_alloc() allocations should be small, preferably
+just a few pages since they must be physically contiguous.  Therefore, a
+rate limited warning will be printed to the console for any kmem_alloc()
+which exceeds a reasonable threshold.
+.sp
+The default warning threshold is set to eight pages but capped at 32K to
+accommodate systems using large pages.  This value was selected to be small
+enough to ensure the largest allocations are quickly noticed and fixed,
+but large enough to avoid logging any warnings when an allocation size is
+larger than optimal but not a serious concern.  Since this value is tunable,
+developers are encouraged to set it lower when testing so any new largish
+allocations are quickly caught.  These warnings may be disabled by setting
+the threshold to zero.
+.sp
+Default value: \fB32,768\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_max\fR (uint)
+.ad
+.RS 12n
+Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+Allocations which are marginally smaller than this limit may succeed but
+should still be avoided due to the expense of locating a contiguous range
+of free pages.  Therefore, a maximum kmem size with reasonable safety
+margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
+will quickly fail.  Vmem_alloc() allocations less than or equal to this
+value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+.sp
+Default value: \fBKMALLOC_MAX_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_magazine_size\fR (uint)
+.ad
+.RS 12n
+Cache magazines are an optimization designed to minimize the cost of
+allocating memory.  They do this by keeping a per-cpu cache of recently
+freed objects, which can then be reallocated without taking a lock. This
+can improve performance on highly contended caches.  However, because
+objects in magazines will prevent otherwise empty slabs from being
+immediately released this may not be ideal for low memory machines.
+.sp
+For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a
+maximum magazine size.  When this value is set to 0 the magazine size will
+be automatically determined based on the object size.  Otherwise magazines
+will be limited to 2-256 objects per magazine (i.e. per cpu).  Magazines
+may never be entirely disabled in this implementation.
+.sp
+Default value: \fB0\fR
 .RE

 .sp
@ -86,9 +216,12 @@ Default value: \fB0\fR.
 \fBspl_hostid\fR (ulong)
 .ad
 .RS 12n
-The system hostid.
+The system hostid, when set this can be used to uniquely identify a system.
+By default this value is set to zero which indicates the hostid is disabled.
+It can be explicitly enabled by placing a unique non-zero value in
+\fB/etc/hostid\fR.
 .sp
-Default value: \fB0xFFFFFFFF\fR (an invalid hostid!)
+Default value: \fB0\fR
 .RE

 .sp
@ -97,9 +230,10 @@ Default value: \fB0xFFFFFFFF\fR (an invalid hostid!)
 \fBspl_hostid_path\fR (charp)
 .ad
 .RS 12n
-The system hostid file
+The expected path to locate the system hostid when specified.  This value
+may be overridden for non-standard configurations.
 .sp
-Default value: \fB/etc/hostid\fR.
+Default value: \fB/etc/hostid\fR
 .RE

 .sp
@ -108,7 +242,10 @@ Default value: \fB/etc/hostid\fR.
 \fBspl_taskq_thread_bind\fR (int)
 .ad
 .RS 12n
-Bind taskq thread to CPU
+Bind taskq threads to specific CPUs.  When enabled all taskq threads will
+be distributed evenly over the available CPUs.  By default, this behavior
+is disabled to allow the Linux scheduler the maximum flexibility to determine
+where a thread should run.
 .sp
-Default value: \fB0\fR.
+Default value: \fB0\fR
 .RE
--- a/module/spl/Makefile.in
+++ b/module/spl/Makefile.in
@ -8,6 +8,8 @@ obj-$(CONFIG_SPL) := $(MODULE).o

 $(MODULE)-objs += @top_srcdir@/module/spl/spl-proc.o
 $(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem.o
+$(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem-cache.o
+$(MODULE)-objs += @top_srcdir@/module/spl/spl-vmem.o
 $(MODULE)-objs += @top_srcdir@/module/spl/spl-thread.o
 $(MODULE)-objs += @top_srcdir@/module/spl/spl-taskq.o
 $(MODULE)-objs += @top_srcdir@/module/spl/spl-rwlock.o
--- a/module/spl/spl-condvar.c
+++ b/module/spl/spl-condvar.c
@ -25,6 +25,7 @@
 \*****************************************************************************/

 #include <sys/condvar.h>
+#include <sys/time.h>

 void
 __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
--- a/module/spl/spl-generic.c
+++ b/module/spl/spl-generic.c
@ -29,6 +29,8 @@
 #include <sys/vmsystm.h>
 #include <sys/kobj.h>
 #include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/taskq.h>
@ -38,6 +40,7 @@
 #include <sys/proc.h>
 #include <sys/kstat.h>
 #include <sys/file.h>
+#include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/math64_compat.h>
 #include <linux/proc_compat.h>
@ -479,12 +482,46 @@ zone_get_hostid(void *zone)
 }
 EXPORT_SYMBOL(zone_get_hostid);

+/*
+ * Bring up the kmem, vmem and kmem cache subsystems, in that order.
+ * When a step fails, the steps that already succeeded are undone in
+ * reverse order and the failing step's return code is handed back.
+ * Returns 0 when all three initializers succeed.
+ */
+static int
+spl_kvmem_init(void)
+{
+	int error;
+
+	if ((error = spl_kmem_init()))
+		return (error);
+
+	if ((error = spl_vmem_init()))
+		goto undo_kmem;
+
+	if ((error = spl_kmem_cache_init()))
+		goto undo_vmem;
+
+	return (0);
+
+undo_vmem:
+	spl_vmem_fini();
+undo_kmem:
+	spl_kmem_fini();
+	return (error);
+}
+
+/*
+ * Tear down the kmem cache, vmem and kmem subsystems; this is the reverse
+ * of the order in which spl_kvmem_init() brings them up.
+ */
+static void
+spl_kvmem_fini(void)
+{
+	spl_kmem_cache_fini();
+	spl_vmem_fini();
+	spl_kmem_fini();
+}
+
 static int
 __init spl_init(void)
 {
 	int rc = 0;

-	if ((rc = spl_kmem_init()))
+	if ((rc = spl_kvmem_init()))
 		goto out1;

 	if ((rc = spl_mutex_init()))
@ -530,7 +567,7 @@ out4:
 out3:
 	spl_mutex_fini();
 out2:
-	spl_kmem_fini();
+	spl_kvmem_fini();
 out1:
 	printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer "
 	       "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE,
@ -552,7 +589,7 @@ spl_fini(void)
 	spl_taskq_fini();
 	spl_rw_fini();
 	spl_mutex_fini();
-	spl_kmem_fini();
+	spl_kvmem_fini();
 }

 /* Called when a dependent module is loaded */
--- a/module/spl/spl-kmem-cache.c
+++ b/module/spl/spl-kmem-cache.c
--- a/module/spl/spl-kmem.c
+++ b/module/spl/spl-kmem.c
--- a/module/spl/spl-kstat.c
+++ b/module/spl/spl-kstat.c
@ -26,6 +26,7 @@

 #include <linux/seq_file.h>
 #include <sys/kstat.h>
+#include <sys/vmem.h>

 #ifndef HAVE_PDE_DATA
 #define PDE_DATA(x) (PDE(x)->data)
--- a/module/spl/spl-proc.c
+++ b/module/spl/spl-proc.c
@ -26,9 +26,14 @@

 #include <sys/systeminfo.h>
 #include <sys/kstat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/seq_file.h>
 #include <linux/proc_compat.h>
+#include <linux/uaccess.h>
 #include <linux/version.h>

 #if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
@ -348,26 +353,6 @@ static struct ctl_table spl_kmem_table[] = {
                .mode     = 0444,
                .proc_handler = &proc_doulongvec_minmax,
        },
-        {
-                .procname = "vmem_used",
-                .data     = &vmem_alloc_used,
-# ifdef HAVE_ATOMIC64_T
-                .maxlen   = sizeof(atomic64_t),
-# else
-                .maxlen   = sizeof(atomic_t),
-# endif /* HAVE_ATOMIC64_T */
-                .mode     = 0444,
-                .proc_handler = &proc_domemused,
-        },
-        {
-                .procname = "vmem_max",
-                .data     = &vmem_alloc_max,
-                .maxlen   = sizeof(unsigned long),
-                .extra1   = &table_min,
-                .extra2   = &table_max,
-                .mode     = 0444,
-                .proc_handler = &proc_doulongvec_minmax,
-        },
        {
                .procname = "slab_kmem_total",
 		.data     = (void *)(KMC_KMEM | KMC_TOTAL),
--- a/module/spl/spl-tsd.c
+++ b/module/spl/spl-tsd.c
@ -61,6 +61,7 @@
 #include <sys/kmem.h>
 #include <sys/thread.h>
 #include <sys/tsd.h>
+#include <linux/hash.h>

 typedef struct tsd_hash_bin {
 	spinlock_t		hb_lock;
@ -336,8 +337,7 @@ tsd_hash_table_init(uint_t bits)
 	if (table == NULL)
 		return (NULL);

-	table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size,
-	    KM_SLEEP | KM_NODEBUG);
+	table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, KM_SLEEP);
 	if (table->ht_bins == NULL) {
 		kmem_free(table, sizeof(tsd_hash_table_t));
 		return (NULL);
--- a/module/spl/spl-vmem.c
+++ b/module/spl/spl-vmem.c
@ -0,0 +1,134 @@
+/*
+ *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ *  Copyright (C) 2007 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ *  UCRL-CODE-235197
+ *
+ *  This file is part of the SPL, Solaris Porting Layer.
+ *  For details, see <http://zfsonlinux.org/>.
+ *
+ *  The SPL is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  The SPL is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/vmem.h>
+#include <linux/mm_compat.h>
+#include <linux/module.h>
+
+vmem_t *heap_arena = NULL;
+EXPORT_SYMBOL(heap_arena);
+
+vmem_t *zio_alloc_arena = NULL;
+EXPORT_SYMBOL(zio_alloc_arena);
+
+vmem_t *zio_arena = NULL;
+EXPORT_SYMBOL(zio_arena);
+
+/*
+ * Report the size of the vmalloc address range.
+ *
+ * vmp must be NULL and typemask must have both VMEM_ALLOC and VMEM_FREE
+ * set; the ASSERTs below check exactly that.  The return value is always
+ * VMALLOC_TOTAL and does not depend on the arguments.
+ */
+size_t
+vmem_size(vmem_t *vmp, int typemask)
+{
+	ASSERT3P(vmp, ==, NULL);
+	ASSERT3S(typemask & VMEM_ALLOC, ==, VMEM_ALLOC);
+	ASSERT3S(typemask & VMEM_FREE, ==, VMEM_FREE);
+
+	return (VMALLOC_TOTAL);
+}
+EXPORT_SYMBOL(vmem_size);
+
+/*
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
+ */
+/*
+ * Backend of the vmem_alloc() macro.  Asserts that only public KM_* flags
+ * were passed, adds KM_VMEM, and forwards to the kmem allocation routine
+ * selected by DEBUG_KMEM / DEBUG_KMEM_TRACKING.  func and line are only
+ * consumed by the tracking variant.
+ */
+void *
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
+{
+	int kmflags = flags | KM_VMEM;
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_track(size, kmflags, func, line, NUMA_NO_NODE));
+#elif defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_debug(size, kmflags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_impl(size, kmflags, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_alloc);
+
+/*
+ * Backend of the vmem_zalloc() macro.  Same as spl_vmem_alloc() except
+ * that KM_ZERO is added alongside KM_VMEM before forwarding to the kmem
+ * allocation routine selected by DEBUG_KMEM / DEBUG_KMEM_TRACKING.
+ */
+void *
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+	int kmflags = flags | (KM_VMEM | KM_ZERO);
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+	return (spl_kmem_alloc_track(size, kmflags, func, line, NUMA_NO_NODE));
+#elif defined(DEBUG_KMEM)
+	return (spl_kmem_alloc_debug(size, kmflags, NUMA_NO_NODE));
+#else
+	return (spl_kmem_alloc_impl(size, kmflags, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_zalloc);
+
+/*
+ * Backend of the vmem_free() macro.  Forwards the buffer and its size to
+ * the kmem free routine selected by DEBUG_KMEM / DEBUG_KMEM_TRACKING.
+ */
+void
+spl_vmem_free(const void *buf, size_t size)
+{
+	/* Plain calls: a void function must not "return (expr)". */
+#if !defined(DEBUG_KMEM)
+	spl_kmem_free_impl(buf, size);
+#elif !defined(DEBUG_KMEM_TRACKING)
+	spl_kmem_free_debug(buf, size);
+#else
+	spl_kmem_free_track(buf, size);
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_free);
+
+/*
+ * Public vmalloc() interface designed to be safe to be called during I/O.
+ */
+/*
+ * Wrapper around __vmalloc().  When the kernel defines PF_MEMALLOC_NOIO
+ * and spl_fstrans_check() is true, the call is bracketed by
+ * memalloc_noio_save() / memalloc_noio_restore(); otherwise __vmalloc()
+ * is called directly.  Returns whatever __vmalloc() returns.
+ */
+void *
+spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot)
+{
+	void *addr;
+#if defined(PF_MEMALLOC_NOIO)
+	unsigned saved_noio = 0;
+
+	if (spl_fstrans_check())
+		saved_noio = memalloc_noio_save();
+#endif
+
+	addr = __vmalloc(size, lflags, prot);
+
+#if defined(PF_MEMALLOC_NOIO)
+	if (spl_fstrans_check())
+		memalloc_noio_restore(saved_noio);
+#endif
+
+	return (addr);
+}
+EXPORT_SYMBOL(spl_vmalloc);
+
+/*
+ * vmem subsystem initializer.  There is nothing to set up here, so it
+ * always returns 0.
+ */
+int
+spl_vmem_init(void)
+{
+	return (0);
+}
+
+/*
+ * vmem subsystem teardown; intentionally empty, mirroring spl_vmem_init().
+ */
+void
+spl_vmem_fini(void)
+{
+}
--- a/module/spl/spl-vnode.c
+++ b/module/spl/spl-vnode.c
@ -26,6 +26,7 @@

 #include <sys/cred.h>
 #include <sys/vnode.h>
+#include <sys/kmem_cache.h>
 #include <linux/falloc.h>
 #include <linux/file_compat.h>

--- a/module/spl/spl-zlib.c
+++ b/module/spl/spl-zlib.c
@ -54,6 +54,7 @@


 #include <sys/kmem.h>
+#include <sys/kmem_cache.h>
 #include <sys/zmod.h>
 #include <linux/zlib_compat.h>

--- a/module/splat/splat-condvar.c
+++ b/module/splat/splat-condvar.c
@ -24,8 +24,9 @@
 *  Solaris Porting LAyer Tests (SPLAT) Condition Variable Tests.
 \*****************************************************************************/

-#include <linux/kthread.h>
 #include <sys/condvar.h>
+#include <sys/timer.h>
+#include <sys/thread.h>
 #include "splat-internal.h"

 #define SPLAT_CONDVAR_NAME		"condvar"
--- a/module/splat/splat-internal.h
+++ b/module/splat/splat-internal.h
@ -27,6 +27,7 @@

 #include "splat-ctl.h"
 #include <sys/mutex.h>
+#include <linux/file_compat.h>

 #define SPLAT_SUBSYSTEM_INIT(type)                                      \
 ({      splat_subsystem_t *_sub_;                                       \
--- a/module/splat/splat-kmem.c
+++ b/module/splat/splat-kmem.c
@ -25,7 +25,11 @@
 \*****************************************************************************/

 #include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/random.h>
 #include <sys/thread.h>
+#include <sys/vmsystm.h>
 #include "splat-internal.h"

 #define SPLAT_KMEM_NAME			"kmem"
@ -92,11 +96,11 @@ splat_kmem_test1(struct file *file, void *arg)
 	int size = PAGE_SIZE;
 	int i, count, rc = 0;

-	while ((!rc) && (size <= (PAGE_SIZE * 32))) {
+	while ((!rc) && (size <= spl_kmem_alloc_warn)) {
 		count = 0;

 		for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) {
-			ptr[i] = kmem_alloc(size, KM_SLEEP | KM_NODEBUG);
+			ptr[i] = kmem_alloc(size, KM_SLEEP);
 			if (ptr[i])
 				count++;
 		}
@ -124,11 +128,11 @@ splat_kmem_test2(struct file *file, void *arg)
 	int size = PAGE_SIZE;
 	int i, j, count, rc = 0;

-	while ((!rc) && (size <= (PAGE_SIZE * 32))) {
+	while ((!rc) && (size <= spl_kmem_alloc_warn)) {
 		count = 0;

 		for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) {
-			ptr[i] = kmem_zalloc(size, KM_SLEEP | KM_NODEBUG);
+			ptr[i] = kmem_zalloc(size, KM_SLEEP);
 			if (ptr[i])
 				count++;
 		}
@ -168,7 +172,11 @@ splat_kmem_test3(struct file *file, void *arg)
 	int size = PAGE_SIZE;
 	int i, count, rc = 0;

-	while ((!rc) && (size <= (PAGE_SIZE * 1024))) {
+	/*
+	 * Test up to 4x the maximum kmem_alloc() size to ensure both
+	 * the kmem_alloc() and vmem_alloc() call paths are used.
+	 */
+	while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) {
 		count = 0;

 		for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) {
@ -200,7 +208,11 @@ splat_kmem_test4(struct file *file, void *arg)
 	int size = PAGE_SIZE;
 	int i, j, count, rc = 0;

-	while ((!rc) && (size <= (PAGE_SIZE * 1024))) {
+	/*
+	 * Test up to 4x the maximum kmem_zalloc() size to ensure both
+	 * the kmem_zalloc() and vmem_zalloc() call paths are used.
+	 */
+	while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) {
 		count = 0;

 		for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) {
@ -572,87 +584,124 @@ out:

 static int
 splat_kmem_cache_test(struct file *file, void *arg, char *name,
-		      int size, int align, int flags)
+    int size, int align, int flags)
 {
-	kmem_cache_priv_t *kcp;
-	kmem_cache_data_t *kcd = NULL;
-	int rc = 0, max;
+	kmem_cache_priv_t *kcp = NULL;
+	kmem_cache_data_t **kcd = NULL;
+	int i, rc = 0, objs = 0;
+
+	splat_vprint(file, name,
+	    "Testing size=%d, align=%d, flags=0x%04x\n",
+	    size, align, flags);

 	kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, align, 0);
 	if (!kcp) {
 		splat_vprint(file, name, "Unable to create '%s'\n", "kcp");
-		return -ENOMEM;
+		return (-ENOMEM);
 	}

-	kcp->kcp_cache =
-		kmem_cache_create(SPLAT_KMEM_CACHE_NAME,
-				  kcp->kcp_size, kcp->kcp_align,
-				  splat_kmem_cache_test_constructor,
-				  splat_kmem_cache_test_destructor,
-				  NULL, kcp, NULL, flags);
-	if (!kcp->kcp_cache) {
-		splat_vprint(file, name,
-			     "Unable to create '%s'\n",
-			     SPLAT_KMEM_CACHE_NAME);
+	kcp->kcp_cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME,
+	    kcp->kcp_size, kcp->kcp_align,
+	    splat_kmem_cache_test_constructor,
+	    splat_kmem_cache_test_destructor,
+	    NULL, kcp, NULL, flags);
+	if (kcp->kcp_cache == NULL) {
+		splat_vprint(file, name, "Unable to create "
+		    "name='%s', size=%d, align=%d, flags=0x%x\n",
+		    SPLAT_KMEM_CACHE_NAME, size, align, flags);
 		rc = -ENOMEM;
 		goto out_free;
 	}

-	kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
-	if (!kcd) {
-		splat_vprint(file, name,
-			     "Unable to allocate from '%s'\n",
-			     SPLAT_KMEM_CACHE_NAME);
-		rc = -EINVAL;
+	/*
+	 * Allocate several slabs worth of objects to verify functionality.
+	 * However, on 32-bit systems with limited address space constrain
+	 * it to a single slab for the purposes of this test.
+	 */
+#ifdef _LP64
+	objs = SPL_KMEM_CACHE_OBJ_PER_SLAB * 4;
+#else
+	objs = 1;
+#endif
+	kcd = kmem_zalloc(sizeof (kmem_cache_data_t *) * objs, KM_SLEEP);
+	if (kcd == NULL) {
+		splat_vprint(file, name, "Unable to allocate pointers "
+		    "for %d objects\n", objs);
+		rc = -ENOMEM;
 		goto out_free;
 	}

-	if (!kcd->kcd_flag) {
-		splat_vprint(file, name,
-			     "Failed to run contructor for '%s'\n",
-			     SPLAT_KMEM_CACHE_NAME);
-		rc = -EINVAL;
-		goto out_free;
+	for (i = 0; i < objs; i++) {
+		kcd[i] = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP);
+		if (kcd[i] == NULL) {
+			splat_vprint(file, name, "Unable to allocate "
+			    "from '%s'\n", SPLAT_KMEM_CACHE_NAME);
+			rc = -EINVAL;
+			goto out_free;
+		}
+
+		if (!kcd[i]->kcd_flag) {
+			splat_vprint(file, name, "Failed to run constructor "
+			    "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
+			rc = -EINVAL;
+			goto out_free;
+		}
+
+		if (kcd[i]->kcd_magic != kcp->kcp_magic) {
+			splat_vprint(file, name,
+			    "Failed to pass private data to constructor "
+			    "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
+			rc = -EINVAL;
+			goto out_free;
+		}
 	}

-	if (kcd->kcd_magic != kcp->kcp_magic) {
-		splat_vprint(file, name,
-			     "Failed to pass private data to constructor "
-			     "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
-		rc = -EINVAL;
-		goto out_free;
+	for (i = 0; i < objs; i++) {
+		kmem_cache_free(kcp->kcp_cache, kcd[i]);
+
+		/* Destructors are run for every kmem_cache_free() */
+		if (kcd[i]->kcd_flag) {
+			splat_vprint(file, name,
+			    "Failed to run destructor for '%s'\n",
+			    SPLAT_KMEM_CACHE_NAME);
+			rc = -EINVAL;
+			goto out_free;
+		}
 	}

-	max = kcp->kcp_count;
-	kmem_cache_free(kcp->kcp_cache, kcd);
-
-	/* Destroy the entire cache which will force destructors to
-	 * run and we can verify one was called for every object */
-	kmem_cache_destroy(kcp->kcp_cache);
 	if (kcp->kcp_count) {
 		splat_vprint(file, name,
-			     "Failed to run destructor on all slab objects "
-			     "for '%s'\n", SPLAT_KMEM_CACHE_NAME);
+		    "Failed to run destructor on all slab objects for '%s'\n",
+		    SPLAT_KMEM_CACHE_NAME);
 		rc = -EINVAL;
 	}

+	kmem_free(kcd, sizeof (kmem_cache_data_t *) * objs);
+	kmem_cache_destroy(kcp->kcp_cache);
+
 	splat_kmem_cache_test_kcp_free(kcp);
 	splat_vprint(file, name,
-		     "Successfully ran ctors/dtors for %d elements in '%s'\n",
-		     max, SPLAT_KMEM_CACHE_NAME);
+	    "Success ran alloc'd/free'd %d objects of size %d\n",
+	    objs, size);

-	return rc;
+	return (rc);

 out_free:
-	if (kcd)
-		kmem_cache_free(kcp->kcp_cache, kcd);
+	if (kcd) {
+		for (i = 0; i < objs; i++) {
+			if (kcd[i] != NULL)
+				kmem_cache_free(kcp->kcp_cache, kcd[i]);
+		}
+
+		kmem_free(kcd, sizeof (kmem_cache_data_t *) * objs);
+	}

 	if (kcp->kcp_cache)
 		kmem_cache_destroy(kcp->kcp_cache);

 	splat_kmem_cache_test_kcp_free(kcp);

-	return rc;
+	return (rc);
 }

 static int
@ -746,35 +795,49 @@ static int
 splat_kmem_test5(struct file *file, void *arg)
 {
 	char *name = SPLAT_KMEM_TEST5_NAME;
-	int rc;
+	int i, rc = 0;

-	/* On slab (default + kmem + vmem) */
-	rc = splat_kmem_cache_test(file, arg, name, 128, 0, 0);
-	if (rc)
-		return rc;
+	/* Run 100 cache tests with random small sizes and alignments. */
+	for (i = 0; i < 100; i++) {
+		int size, align, flags = 0;
+		uint32_t rnd;

-	rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_KMEM);
-	if (rc)
-		return rc;
+		/* Evenly distribute tests over all valid cache types */
+		get_random_bytes((void *)&rnd, sizeof (uint32_t));
+		switch (rnd & 0x03) {
+		default:
+		case 0x00:
+			flags = 0;
+			break;
+		case 0x01:
+			flags = KMC_KMEM;
+			break;
+		case 0x02:
+			flags = KMC_VMEM;
+			break;
+		case 0x03:
+			flags = KMC_SLAB;
+			break;
+		}

-	rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_VMEM);
-	if (rc)
-		return rc;
+		/* The following flags are set with a 1/10 chance */
+		flags |= ((((rnd >> 8) % 10) == 0) ? KMC_OFFSLAB : 0);
+		flags |= ((((rnd >> 16) % 10) == 0) ? KMC_NOEMERGENCY : 0);

-	/* Off slab (default + kmem + vmem) */
-	rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_OFFSLAB);
-	if (rc)
-		return rc;
+		/* Object size in bytes: 32 <= size <= PAGE_SIZE */
+		get_random_bytes((void *)&rnd, sizeof (uint32_t));
+		size = MAX(rnd % (PAGE_SIZE + 1), 32);

-	rc = splat_kmem_cache_test(file, arg, name, 128, 0,
-	    KMC_KMEM | KMC_OFFSLAB);
-	if (rc)
-		return rc;
+		/* Alignment is 2^N where (3 <= N <= PAGE_SHIFT) */
+		get_random_bytes((void *)&rnd, sizeof (uint32_t));
+		align = (1 << MAX(3, rnd % (PAGE_SHIFT + 1)));

-	rc = splat_kmem_cache_test(file, arg, name, 128, 0,
-	    KMC_VMEM | KMC_OFFSLAB);
+		rc = splat_kmem_cache_test(file, arg, name, size, align, flags);
+		if (rc)
+			return (rc);
+	}

-	return rc;
+	return (rc);
 }

 /*
@ -784,44 +847,53 @@ static int
 splat_kmem_test6(struct file *file, void *arg)
 {
 	char *name = SPLAT_KMEM_TEST6_NAME;
-	int rc;
+	int i, max_size, rc = 0;

-	/* On slab (default + kmem + vmem) */
-	rc = splat_kmem_cache_test(file, arg, name, 256*1024, 0, 0);
-	if (rc)
-		return rc;
+	/* Run 100 cache tests with random large sizes and alignments. */
+	for (i = 0; i < 100; i++) {
+		int size, align, flags = 0;
+		uint32_t rnd;

-	rc = splat_kmem_cache_test(file, arg, name, 64*1024, 0, KMC_KMEM);
-	if (rc)
-		return rc;
+		/*
+		 * Evenly distribute tests over all valid cache types; each
+		 * type also selects the largest object size to request.
+		 */
+		get_random_bytes((void *)&rnd, sizeof (uint32_t));
+		switch (rnd & 0x03) {
+		default:
+		case 0x00:
+			flags = 0;
+			max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2;
+			break;
+		case 0x01:
+			flags = KMC_KMEM;
+			max_size = (SPL_MAX_ORDER_NR_PAGES - 2) * PAGE_SIZE;
+			break;
+		case 0x02:
+			flags = KMC_VMEM;
+			max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2;
+			break;
+		case 0x03:
+			flags = KMC_SLAB;
+			max_size = SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE;
+			break;
+		}

-	rc = splat_kmem_cache_test(file, arg, name, 1024*1024, 0, KMC_VMEM);
-	if (rc)
-		return rc;
+		/* The following flags are set with a 1/10 chance */
+		flags |= ((((rnd >> 8) % 10) == 0) ? KMC_OFFSLAB : 0);
+		flags |= ((((rnd >> 16) % 10) == 0) ? KMC_NOEMERGENCY : 0);

-	rc = splat_kmem_cache_test(file, arg, name, 16*1024*1024, 0, KMC_VMEM);
-	if (rc)
-		return rc;
+		/* Size in [PAGE_SIZE, max_size]; never below PAGE_SIZE */
+		get_random_bytes((void *)&rnd, sizeof (uint32_t));
+		size = MAX(rnd % (max_size + 1), PAGE_SIZE);

-	/* Off slab (default + kmem + vmem) */
-	rc = splat_kmem_cache_test(file, arg, name, 256*1024, 0, KMC_OFFSLAB);
-	if (rc)
-		return rc;
+		/* Alignment is 2^N where (3 <= N <= PAGE_SHIFT) */
+		get_random_bytes((void *)&rnd, sizeof (uint32_t));
+		align = (1 << MAX(3, rnd % (PAGE_SHIFT + 1)));

-	rc = splat_kmem_cache_test(file, arg, name, 64*1024, 0,
-	    KMC_KMEM | KMC_OFFSLAB);
-	if (rc)
-		return rc;
+		rc = splat_kmem_cache_test(file, arg, name, size, align, flags);
+		if (rc)
+			return (rc);
+	}

-	rc = splat_kmem_cache_test(file, arg, name, 1024*1024, 0,
-	    KMC_VMEM | KMC_OFFSLAB);
-	if (rc)
-		return rc;
-
-	rc = splat_kmem_cache_test(file, arg, name, 16*1024*1024, 0,
-	    KMC_VMEM | KMC_OFFSLAB);
-
-	return rc;
+	return (rc);
 }

 /*
@ -831,14 +903,20 @@ static int
 splat_kmem_test7(struct file *file, void *arg)
 {
 	char *name = SPLAT_KMEM_TEST7_NAME;
+	int max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2;
 	int i, rc;

 	for (i = SPL_KMEM_CACHE_ALIGN; i <= PAGE_SIZE; i *= 2) {
-		rc = splat_kmem_cache_test(file, arg, name, 157, i, 0);
+		uint32_t size;
+
+		get_random_bytes((void *)&size, sizeof (uint32_t));
+		size = MAX(size % (max_size + 1), 32);
+
+		rc = splat_kmem_cache_test(file, arg, name, size, i, 0);
 		if (rc)
 			return rc;

-		rc = splat_kmem_cache_test(file, arg, name, 157, i,
+		rc = splat_kmem_cache_test(file, arg, name, size, i,
 		    KMC_OFFSLAB);
 		if (rc)
 			return rc;
--- a/module/splat/splat-taskq.c
+++ b/module/splat/splat-taskq.c
@ -25,8 +25,10 @@
 \*****************************************************************************/

 #include <sys/kmem.h>
+#include <sys/vmem.h>
 #include <sys/random.h>
 #include <sys/taskq.h>
+#include <sys/timer.h>
 #include <linux/delay.h>
 #include "splat-internal.h"

--- a/module/splat/splat-zlib.c
+++ b/module/splat/splat-zlib.c
@ -27,6 +27,7 @@
 #include <sys/zmod.h>
 #include <sys/random.h>
 #include <sys/kmem.h>
+#include <sys/vmem.h>
 #include "splat-internal.h"

 #define SPLAT_ZLIB_NAME			"zlib"