From 9ab1ac14ad955800ca070abc11dd031244efb65f Mon Sep 17 00:00:00 2001
From: behlendo <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c>
Date: Mon, 5 May 2008 20:18:49 +0000
Subject: [PATCH] Commit adaptive mutexes.  This seems to have introduced some
 new crashes but it's not clear to me yet if these are a problem with the
 mutex implementation or ZFS's usage of it.

Minor taskq fixes to add new tasks to the end of the pending list.

Minor enhancements to the debug infrastructure.


git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@94 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c
---
 ChangeLog                 |   7 ++
 include/sys/debug.h       |  11 +-
 include/sys/kmem.h        |  16 +--
 include/sys/mutex.h       | 217 ++++++++++----------------------
 modules/spl/Makefile.in   |   1 +
 modules/spl/spl-generic.c |  18 ++-
 modules/spl/spl-mutex.c   | 256 ++++++++++++++++++++++++++++++++++++++
 modules/spl/spl-proc.c    | 234 +++++++++++++++++++++++++++++-----
 modules/spl/spl-taskq.c   |   6 +-
 9 files changed, 563 insertions(+), 203 deletions(-)
 create mode 100644 modules/spl/spl-mutex.c

diff --git a/ChangeLog b/ChangeLog
index 0106bbd5e..a65d6b15d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2008-04-26 Brian Behlendorf <behlendorf1@llnl.gov>
+
+	* include/sys/mutex.h : Implemented a close approximation
+	of adaptive mutexes.  These changes however required me to 
+	export a new symbol from the kernel proper 'task_curr()'
+	which means we are now dependent on a patched kernel.
+
 2008-04-24 Brian Behlendorf <behlendorf1@llnl.gov>
 
 	* : Tag spl-0.2.1 
diff --git a/include/sys/debug.h b/include/sys/debug.h
index 64aa1808f..39585ba19 100644
--- a/include/sys/debug.h
+++ b/include/sys/debug.h
@@ -310,16 +310,19 @@ do {                                                                    \
         return RETURN__ret;                                             \
 } while (0)
 
-#define ENTRY                                                           \
+#define __ENTRY(subsys)                                                 \
 do {                                                                    \
-        CDEBUG(D_TRACE, "Process entered\n");                           \
+        __CDEBUG(NULL, subsys, D_TRACE, "Process entered\n");           \
 } while (0)
 
-#define EXIT                                                            \
+#define __EXIT(subsys)                                                  \
 do {                                                                    \
-        CDEBUG(D_TRACE, "Process leaving\n");                           \
+        __CDEBUG(NULL, subsys, D_TRACE, "Process leaving\n");           \
 } while(0)
 
+#define ENTRY				__ENTRY(DEBUG_SUBSYSTEM)
+#define EXIT                            __EXIT(DEBUG_SUBSYSTEM)
+
 extern int spl_debug_vmsg(spl_debug_limit_state_t *cdls, int subsys, int mask,
                           const char *file, const char *fn, const int line,
                           const char *format1, va_list args, const char *format2, ...);
diff --git a/include/sys/kmem.h b/include/sys/kmem.h
index 73965c58b..cc56ddd36 100644
--- a/include/sys/kmem.h
+++ b/include/sys/kmem.h
@@ -54,9 +54,9 @@ extern int kmem_warning_flag;
                 if (unlikely(atomic64_read(&kmem_alloc_used)>kmem_alloc_max)) \
                         kmem_alloc_max = atomic64_read(&kmem_alloc_used);     \
 			                                                      \
-                __CDEBUG_LIMIT(S_KMEM, D_INFO, "kmem_alloc(%d, 0x%x)'d "      \
+                __CDEBUG_LIMIT(S_KMEM, D_INFO, "kmem_alloc(%d, 0x%x) = %p "   \
 			       "(%ld/%ld)\n", (int)(size), (int)(flags),      \
-		               atomic64_read(&kmem_alloc_used),               \
+		               _ptr_, atomic64_read(&kmem_alloc_used),        \
 			       kmem_alloc_max);                               \
         }                                                                     \
                                                                               \
@@ -70,8 +70,8 @@ extern int kmem_warning_flag;
 ({                                                                            \
         ASSERT((ptr) || (size > 0));                                          \
         atomic64_sub((size), &kmem_alloc_used);                               \
-        __CDEBUG_LIMIT(S_KMEM, D_INFO, "kmem_free(%d)'d (%ld/%ld)\n",         \
-		       (int)(size), atomic64_read(&kmem_alloc_used),          \
+        __CDEBUG_LIMIT(S_KMEM, D_INFO, "kmem_free(%p, %d) (%ld/%ld)\n",       \
+		       (ptr), (int)(size), atomic64_read(&kmem_alloc_used),   \
 		       kmem_alloc_max);                                       \
         memset(ptr, 0x5a, (size)); /* Poison */                               \
         kfree(ptr);                                                           \
@@ -99,9 +99,9 @@ extern int kmem_warning_flag;
                 if (unlikely(atomic64_read(&vmem_alloc_used)>vmem_alloc_max)) \
                         vmem_alloc_max = atomic64_read(&vmem_alloc_used);     \
                                                                               \
-                __CDEBUG_LIMIT(S_KMEM, D_INFO, "vmem_alloc(%d, 0x%x)'d "      \
+                __CDEBUG_LIMIT(S_KMEM, D_INFO, "vmem_alloc(%d, 0x%x) = %p "   \
 			       "(%ld/%ld)\n", (int)(size), (int)(flags),      \
-		               atomic64_read(&vmem_alloc_used),               \
+		               _ptr_, atomic64_read(&vmem_alloc_used),        \
 			       vmem_alloc_max);                               \
         }                                                                     \
                                                                               \
@@ -116,8 +116,8 @@ extern int kmem_warning_flag;
 ({                                                                            \
         ASSERT((ptr) || (size > 0));                                          \
         atomic64_sub((size), &vmem_alloc_used);                               \
-        __CDEBUG_LIMIT(S_KMEM, D_INFO, "vmem_free(%d)'d (%ld/%ld)\n",         \
-		       (int)(size), atomic64_read(&vmem_alloc_used),          \
+        __CDEBUG_LIMIT(S_KMEM, D_INFO, "vmem_free(%p, %d) (%ld/%ld)\n",       \
+		       (ptr), (int)(size), atomic64_read(&vmem_alloc_used),   \
 		       vmem_alloc_max);                                       \
         memset(ptr, 0x5a, (size)); /* Poison */                               \
         vfree(ptr);                                                           \
diff --git a/include/sys/mutex.h b/include/sys/mutex.h
index 045842d72..fd787bb9d 100644
--- a/include/sys/mutex.h
+++ b/include/sys/mutex.h
@@ -8,175 +8,86 @@ extern "C" {
 #include <linux/module.h>
 #include <linux/hardirq.h>
 #include <sys/types.h>
+#include <sys/kmem.h>
 
-/* See the "Big Theory Statement" in solaris mutex.c.
- *
- * Spin mutexes apparently aren't needed by zfs so we assert
- * if ibc is non-zero.
- *
- * Our impementation of adaptive mutexes aren't really adaptive.
- * They go to sleep every time.
- */
+//#define DEBUG_MUTEX
+#undef DEBUG_MUTEX
 
 #define MUTEX_DEFAULT		0
-#define MUTEX_HELD(x)           (mutex_owned(x))
+#define MUTEX_SPIN		1
+#define MUTEX_ADAPTIVE		2
+
+#define MUTEX_ENTER_TOTAL	0
+#define MUTEX_ENTER_NOT_HELD	1
+#define MUTEX_ENTER_SPIN	2
+#define MUTEX_ENTER_SLEEP	3
+#define MUTEX_TRYENTER_TOTAL	4
+#define MUTEX_TRYENTER_NOT_HELD	5
+#define MUTEX_STATS_SIZE	6
 
 #define KM_MAGIC		0x42424242
 #define KM_POISON		0x84
 
 typedef struct {
-	int km_magic;
+	int32_t km_magic;
+	int16_t km_type;
+	int16_t km_name_size;
 	char *km_name;
 	struct task_struct *km_owner;
-	struct semaphore km_sem;
-	spinlock_t km_lock;
+	struct semaphore *km_sem;
+#ifdef DEBUG_MUTEX
+	int *km_stats;
+	struct list_head km_list;
+#endif
 } kmutex_t;
 
+extern int mutex_spin_max;
+
+#ifdef DEBUG_MUTEX
+extern int mutex_stats[MUTEX_STATS_SIZE];
+extern struct mutex mutex_stats_lock;
+extern struct list_head mutex_stats_list;
+#define MUTEX_STAT_INC(stats, stat)	((stats)[stat]++)
+#else
+#define MUTEX_STAT_INC(stats, stat)
+#endif
+
+int spl_mutex_init(void);
+void spl_mutex_fini(void);
+
+extern void __spl_mutex_init(kmutex_t *mp, char *name, int type, void *ibc);
+extern void __spl_mutex_destroy(kmutex_t *mp);
+extern int __mutex_tryenter(kmutex_t *mp);
+extern void __mutex_enter(kmutex_t *mp);
+extern void __mutex_exit(kmutex_t *mp);
+extern int __mutex_owned(kmutex_t *mp);
+extern kthread_t *__spl_mutex_owner(kmutex_t *mp);
+
 #undef mutex_init
-static __inline__ void
-mutex_init(kmutex_t *mp, char *name, int type, void *ibc)
-{
-	ENTRY;
-	ASSERT(mp);
-	ASSERT(ibc == NULL);		/* XXX - Spin mutexes not needed */
-	ASSERT(type == MUTEX_DEFAULT);	/* XXX - Only default type supported */
-
-	mp->km_magic = KM_MAGIC;
-	spin_lock_init(&mp->km_lock);
-	sema_init(&mp->km_sem, 1);
-	mp->km_owner = NULL;
-	mp->km_name = NULL;
-
-	if (name) {
-		mp->km_name = kmalloc(strlen(name) + 1, GFP_KERNEL);
-		if (mp->km_name)
-			strcpy(mp->km_name, name);
-	}
-	EXIT;
-}
-
 #undef mutex_destroy
-static __inline__ void
-mutex_destroy(kmutex_t *mp)
-{
-	ENTRY;
-	ASSERT(mp);
-	ASSERT(mp->km_magic == KM_MAGIC);
-	spin_lock(&mp->km_lock);
 
-	if (mp->km_name)
-		kfree(mp->km_name);
+#define mutex_init(mp, name, type, ibc)					\
+({									\
+        __ENTRY(S_MUTEX);                                               \
+	if ((name) == NULL)						\
+		__spl_mutex_init(mp, #mp, type, ibc);			\
+	else								\
+		__spl_mutex_init(mp, name, type, ibc);			\
+        __EXIT(S_MUTEX);                                                \
+})
+#define mutex_destroy(mp)						\
+({									\
+        __ENTRY(S_MUTEX);                                               \
+	__spl_mutex_destroy(mp);                                        \
+        __EXIT(S_MUTEX);                                                \
+})
 
-	memset(mp, KM_POISON, sizeof(*mp));
-	spin_unlock(&mp->km_lock);
-	EXIT;
-}
-
-static __inline__ void
-mutex_enter(kmutex_t *mp)
-{
-	ENTRY;
-	ASSERT(mp);
-	ASSERT(mp->km_magic == KM_MAGIC);
-	spin_lock(&mp->km_lock);
-
-	if (unlikely(in_atomic() && !current->exit_state)) {
-		spin_unlock(&mp->km_lock);
-		__CDEBUG_LIMIT(S_MUTEX, D_ERROR,
-			       "May schedule while atomic: %s/0x%08x/%d\n",
-		               current->comm, preempt_count(), current->pid);
-		SBUG();
-	}
-
-	spin_unlock(&mp->km_lock);
-
-	down(&mp->km_sem);
-
-	spin_lock(&mp->km_lock);
-	ASSERT(mp->km_owner == NULL);
-	mp->km_owner = current;
-	spin_unlock(&mp->km_lock);
-	EXIT;
-}
-
-/* Return 1 if we acquired the mutex, else zero.  */
-static __inline__ int
-mutex_tryenter(kmutex_t *mp)
-{
-	int rc;
-	ENTRY;
-
-	ASSERT(mp);
-	ASSERT(mp->km_magic == KM_MAGIC);
-	spin_lock(&mp->km_lock);
-
-	if (unlikely(in_atomic() && !current->exit_state)) {
-		spin_unlock(&mp->km_lock);
-		__CDEBUG_LIMIT(S_MUTEX, D_ERROR,
-			       "May schedule while atomic: %s/0x%08x/%d\n",
-		               current->comm, preempt_count(), current->pid);
-		SBUG();
-	}
-
-	spin_unlock(&mp->km_lock);
-	rc = down_trylock(&mp->km_sem); /* returns 0 if acquired */
-	if (rc == 0) {
-		spin_lock(&mp->km_lock);
-		ASSERT(mp->km_owner == NULL);
-		mp->km_owner = current;
-		spin_unlock(&mp->km_lock);
-		RETURN(1);
-	}
-
-	RETURN(0);
-}
-
-static __inline__ void
-mutex_exit(kmutex_t *mp)
-{
-	ENTRY;
-	ASSERT(mp);
-	ASSERT(mp->km_magic == KM_MAGIC);
-	spin_lock(&mp->km_lock);
-
-	ASSERT(mp->km_owner == current);
-	mp->km_owner = NULL;
-	spin_unlock(&mp->km_lock);
-	up(&mp->km_sem);
-	EXIT;
-}
-
-/* Return 1 if mutex is held by current process, else zero.  */
-static __inline__ int
-mutex_owned(kmutex_t *mp)
-{
-	int rc;
-	ENTRY;
-
-	ASSERT(mp);
-	ASSERT(mp->km_magic == KM_MAGIC);
-	spin_lock(&mp->km_lock);
-	rc = (mp->km_owner == current);
-	spin_unlock(&mp->km_lock);
-
-	RETURN(rc);
-}
-
-/* Return owner if mutex is owned, else NULL.  */
-static __inline__ kthread_t *
-mutex_owner(kmutex_t *mp)
-{
-	kthread_t *thr;
-	ENTRY;
-
-	ASSERT(mp);
-	ASSERT(mp->km_magic == KM_MAGIC);
-	spin_lock(&mp->km_lock);
-	thr = mp->km_owner;
-	spin_unlock(&mp->km_lock);
-
-	RETURN(thr);
-}
+#define mutex_tryenter(mp)	__mutex_tryenter(mp)
+#define mutex_enter(mp)		__mutex_enter(mp)
+#define mutex_exit(mp)		__mutex_exit(mp)
+#define mutex_owned(mp)		__mutex_owned(mp)
+#define mutex_owner(mp)		__spl_mutex_owner(mp)
+#define MUTEX_HELD(mp)		mutex_owned(mp)
 
 #ifdef	__cplusplus
 }
diff --git a/modules/spl/Makefile.in b/modules/spl/Makefile.in
index ff283dfd6..bd2a5f9f2 100644
--- a/modules/spl/Makefile.in
+++ b/modules/spl/Makefile.in
@@ -22,6 +22,7 @@ spl-objs += spl-kobj.o
 spl-objs += spl-module.o
 spl-objs += spl-generic.o
 spl-objs += spl-atomic.o
+spl-objs += spl-mutex.o
 
 splmodule := spl.ko
 splmoduledir := @kmoduledir@/kernel/lib/
diff --git a/modules/spl/spl-generic.c b/modules/spl/spl-generic.c
index 1aadb990e..99497dd51 100644
--- a/modules/spl/spl-generic.c
+++ b/modules/spl/spl-generic.c
@@ -2,6 +2,7 @@
 #include <sys/vmsystm.h>
 #include <sys/vnode.h>
 #include <sys/kmem.h>
+#include <sys/mutex.h>
 #include <sys/debug.h>
 #include <sys/proc.h>
 #include <linux/kmod.h>
@@ -99,21 +100,26 @@ static int __init spl_init(void)
 	if ((rc = kmem_init()))
 		GOTO(out , rc);
 
-	if ((rc = vn_init()))
-		GOTO(out2, rc);
+	if ((rc = spl_mutex_init()))
+		GOTO(out2 , rc);
 
-	if ((rc = proc_init()))
+	if ((rc = vn_init()))
 		GOTO(out3, rc);
 
+	if ((rc = proc_init()))
+		GOTO(out4, rc);
+
 	if ((rc = set_hostid()))
-		GOTO(out4, rc = -EADDRNOTAVAIL);
+		GOTO(out5, rc = -EADDRNOTAVAIL);
 
 	printk("SPL: Loaded Solaris Porting Layer v%s\n", VERSION);
 	RETURN(rc);
-out4:
+out5:
 	proc_fini();
-out3:
+out4:
 	vn_fini();
+out3:
+	spl_mutex_fini();
 out2:
 	kmem_fini();
 out:
diff --git a/modules/spl/spl-mutex.c b/modules/spl/spl-mutex.c
new file mode 100644
index 000000000..06a8f316b
--- /dev/null
+++ b/modules/spl/spl-mutex.c
@@ -0,0 +1,256 @@
+#include <sys/mutex.h>
+
+#ifdef DEBUG_SUBSYSTEM
+#undef DEBUG_SUBSYSTEM
+#endif
+
+#define DEBUG_SUBSYSTEM S_MUTEX
+
+/* Mutex implementation based on those found in Solaris.  This means
+ * the MUTEX_DEFAULT type is an adaptive mutex.  When calling
+ * mutex_enter() your process will spin waiting for the lock if it's
+ * likely the lock will be freed shortly.  If it looks like the
+ * lock will be held for a longer time we schedule and sleep waiting
+ * for it.  This determination is made by checking if the holder of
+ * the lock is currently running on cpu or sleeping waiting to be
+ * scheduled.  If the holder is currently running it's likely the
+ * lock will be shortly dropped.
+ *
+ * XXX: This is basically a rough implementation to see if this
+ * helps our performance.  If it does a more careful implementation
+ * should be done, perhaps in assembly.
+ */
+
+/*  0:         Never spin when trying to acquire lock
+ * -1:         Spin until acquired or holder yields without dropping lock
+ *  1-MAX_INT: Spin for N attempts before sleeping for lock
+ */
+int mutex_spin_max = 100;
+
+#ifdef DEBUG_MUTEX
+int mutex_stats[MUTEX_STATS_SIZE] = { 0 };
+DEFINE_MUTEX(mutex_stats_lock);
+LIST_HEAD(mutex_stats_list);
+#endif
+
+/* Initialize 'mp' as a mutex named 'name'; MUTEX_DEFAULT means adaptive.
+ * km_magic is stored last so a failed init never passes as a live mutex. */
+void
+__spl_mutex_init(kmutex_t *mp, char *name, int type, void *ibc)
+{
+	ASSERT(mp);
+	ASSERT(name);
+	ASSERT(ibc == NULL);
+	ASSERT(mp->km_magic != KM_MAGIC); /* Never double init */
+
+	mp->km_owner = NULL;
+	mp->km_name = NULL;
+	mp->km_name_size = strlen(name) + 1;
+
+	switch (type) {
+		case MUTEX_DEFAULT:
+			mp->km_type = MUTEX_ADAPTIVE;
+			break;
+		case MUTEX_SPIN:
+		case MUTEX_ADAPTIVE:
+			mp->km_type = type;
+			break;
+		default:
+			SBUG();
+	}
+
+	/* Semaphore kmem_alloc'ed to keep struct size down (<64b) */
+	mp->km_sem = kmem_alloc(sizeof(struct semaphore), KM_SLEEP);
+	if (mp->km_sem == NULL)
+		return;
+	mp->km_name = kmem_alloc(mp->km_name_size, KM_SLEEP);
+	if (mp->km_name == NULL) {
+		kmem_free(mp->km_sem, sizeof(struct semaphore));
+		return;
+	}
+	sema_init(mp->km_sem, 1);
+	strcpy(mp->km_name, name);
+#ifdef DEBUG_MUTEX
+	mp->km_stats = kmem_zalloc(sizeof(int) * MUTEX_STATS_SIZE, KM_SLEEP);
+	if (mp->km_stats == NULL) {
+		kmem_free(mp->km_name, mp->km_name_size);
+		kmem_free(mp->km_sem, sizeof(struct semaphore));
+		return;
+	}
+#endif
+	mp->km_magic = KM_MAGIC; /* Only now may other code trust 'mp' */
+#ifdef DEBUG_MUTEX
+	mutex_lock(&mutex_stats_lock);
+	list_add_tail(&mp->km_list, &mutex_stats_list);
+	mutex_unlock(&mutex_stats_lock);
+#endif
+}
+EXPORT_SYMBOL(__spl_mutex_init);
+
+/* Release everything __spl_mutex_init() allocated for 'mp' and poison it. */
+void
+__spl_mutex_destroy(kmutex_t *mp)
+{
+	ASSERT(mp);
+	ASSERT(mp->km_magic == KM_MAGIC);
+#ifdef DEBUG_MUTEX
+	mutex_lock(&mutex_stats_lock);
+	list_del_init(&mp->km_list);
+	mutex_unlock(&mutex_stats_lock);
+
+	kmem_free(mp->km_stats, sizeof(int) * MUTEX_STATS_SIZE);
+#endif
+	kmem_free(mp->km_name, mp->km_name_size);
+	kmem_free(mp->km_sem, sizeof(struct semaphore));
+	/* Overwriting km_magic makes later use fail the KM_MAGIC asserts */
+	memset(mp, KM_POISON, sizeof(*mp));
+}
+EXPORT_SYMBOL(__spl_mutex_destroy);
+
+/* Single non-blocking attempt on 'mp': return 1 if acquired, else zero.  */
+int
+__mutex_tryenter(kmutex_t *mp)
+{
+	int rc;
+	ENTRY;
+
+	ASSERT(mp);
+	ASSERT(mp->km_magic == KM_MAGIC);
+	MUTEX_STAT_INC(mutex_stats, MUTEX_TRYENTER_TOTAL);
+	MUTEX_STAT_INC(mp->km_stats, MUTEX_TRYENTER_TOTAL);
+
+	rc = down_trylock(mp->km_sem); /* 0 means the semaphore is ours */
+	if (rc == 0) {
+		ASSERT(mp->km_owner == NULL);
+		mp->km_owner = current;
+		MUTEX_STAT_INC(mutex_stats, MUTEX_TRYENTER_NOT_HELD);
+		MUTEX_STAT_INC(mp->km_stats, MUTEX_TRYENTER_NOT_HELD);
+	}
+
+	RETURN(!rc);
+}
+EXPORT_SYMBOL(__mutex_tryenter);
+
+/* Acquire an adaptive mutex: spin while the holder is on a cpu (bounded
+ * by mutex_spin_max), otherwise sleep on the semaphore.
+ * NOTE(review): 'owner' is sampled without a lock or task reference, so
+ * task_curr() may look at a task that already exited - confirm lifetime. */
+static void
+mutex_enter_adaptive(kmutex_t *mp)
+{
+	struct task_struct *owner;
+	int count = 0;
+
+	/* Lock is not held so we expect to acquire the lock */
+	if ((owner = mp->km_owner) == NULL) {
+		down(mp->km_sem);
+		MUTEX_STAT_INC(mutex_stats, MUTEX_ENTER_NOT_HELD);
+		MUTEX_STAT_INC(mp->km_stats, MUTEX_ENTER_NOT_HELD);
+	} else {
+		/* The lock is held by a currently running task which we
+		 * expect will drop the lock before leaving the head of the
+		 * runqueue, so spinning avoids a context switch.  If the
+		 * holder yields the processor without dropping the lock
+		 * we stop spinning and sleep.  mutex_spin_max bounds the
+		 * attempts: 0 never spins, N > 0 allows N tries, and a
+		 * negative value spins for as long as the holder runs.
+		 */
+		while (task_curr(owner) &&
+		       (mutex_spin_max < 0 || count < mutex_spin_max)) {
+			if (down_trylock(mp->km_sem) == 0) {
+				MUTEX_STAT_INC(mutex_stats, MUTEX_ENTER_SPIN);
+				MUTEX_STAT_INC(mp->km_stats, MUTEX_ENTER_SPIN);
+				GOTO(out, count);
+			}
+			if (mutex_spin_max >= 0) /* No overflow when unbounded */
+				count++;
+		}
+
+		/* Either the holder went to sleep or we ran out of spin
+		 * attempts, so it costs at least one context switch. */
+		down(mp->km_sem);
+		MUTEX_STAT_INC(mutex_stats, MUTEX_ENTER_SLEEP);
+		MUTEX_STAT_INC(mp->km_stats, MUTEX_ENTER_SLEEP);
+	}
+out:
+	MUTEX_STAT_INC(mutex_stats, MUTEX_ENTER_TOTAL);
+	MUTEX_STAT_INC(mp->km_stats, MUTEX_ENTER_TOTAL);
+}
+
+/* Block until 'mp' is acquired, then record the caller as its owner. */
+void
+__mutex_enter(kmutex_t *mp)
+{
+	ENTRY;
+	ASSERT(mp);
+	ASSERT(mp->km_magic == KM_MAGIC);
+	switch (mp->km_type) {
+		case MUTEX_SPIN:
+			while (down_trylock(mp->km_sem)); /* Busy wait */
+			MUTEX_STAT_INC(mutex_stats, MUTEX_ENTER_SPIN);
+			MUTEX_STAT_INC(mp->km_stats, MUTEX_ENTER_SPIN);
+			break;
+		case MUTEX_ADAPTIVE:
+			mutex_enter_adaptive(mp);
+			break;
+		default: /* Corrupt type: never claim ownership unlocked */
+			SBUG();
+	}
+	ASSERT(mp->km_owner == NULL);
+	mp->km_owner = current;
+	EXIT;
+}
+EXPORT_SYMBOL(__mutex_enter);
+
+void
+__mutex_exit(kmutex_t *mp)
+{
+	ENTRY;
+	ASSERT(mp);
+	ASSERT(mp->km_magic == KM_MAGIC);
+	ASSERT(mp->km_owner == current); /* Only the holder may release */
+	mp->km_owner = NULL; /* Clear before up() so the next holder sees NULL */
+	up(mp->km_sem);
+	EXIT;
+}
+EXPORT_SYMBOL(__mutex_exit);
+
+/* Return 1 if 'mp' is held by the calling task, else 0 (unlocked read).  */
+int
+__mutex_owned(kmutex_t *mp)
+{
+	ENTRY;
+	ASSERT(mp);
+	ASSERT(mp->km_magic == KM_MAGIC);
+	RETURN(mp->km_owner == current);
+}
+EXPORT_SYMBOL(__mutex_owned);
+
+/* Return the task owning 'mp' or NULL; an unlocked snapshot of km_owner.  */
+kthread_t *
+__spl_mutex_owner(kmutex_t *mp)
+{
+	ENTRY;
+	ASSERT(mp);
+	ASSERT(mp->km_magic == KM_MAGIC);
+	RETURN(mp->km_owner);
+}
+EXPORT_SYMBOL(__spl_mutex_owner);
+
+int
+spl_mutex_init(void)
+{
+	ENTRY;
+	RETURN(0); /* Nothing to set up; the globals are statically initialized */
+}
+
+void
+spl_mutex_fini(void)
+{
+        ENTRY;
+#ifdef DEBUG_MUTEX
+	ASSERT(list_empty(&mutex_stats_list)); /* No mutex may remain registered */
+#endif
+        EXIT;
+}
+
diff --git a/modules/spl/spl-proc.c b/modules/spl/spl-proc.c
index 94dd937a1..64423c186 100644
--- a/modules/spl/spl-proc.c
+++ b/modules/spl/spl-proc.c
@@ -3,8 +3,10 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
+#include <sys/mutex.h>
 #include <sys/debug.h>
 #include "config.h"
 
@@ -18,10 +20,17 @@ static struct ctl_table_header *spl_header = NULL;
 static unsigned long table_min = 0;
 static unsigned long table_max = ~0;
 
-#define CTL_SPL 0x87
+#define CTL_SPL		0x87
+#define CTL_SPL_DEBUG	0x88
+#define CTL_SPL_MUTEX	0x89
+#define CTL_SPL_KMEM	0x90
+
 enum {
 	CTL_VERSION = 1,          /* Version */
-        CTL_DEBUG_SUBSYS,         /* Debug subsystem */
+	CTL_HOSTID,               /* Host id reported by /usr/bin/hostid */
+	CTL_HW_SERIAL,            /* Hardware serial number from hostid */
+
+	CTL_DEBUG_SUBSYS,         /* Debug subsystem */
         CTL_DEBUG_MASK,           /* Debug mask */
         CTL_DEBUG_PRINTK,         /* Force all messages to console */
         CTL_DEBUG_MB,             /* Debug buffer size */
@@ -31,19 +40,23 @@ enum {
         CTL_DEBUG_PATH,           /* Dump log location */
         CTL_DEBUG_DUMP,           /* Dump debug buffer to file */
         CTL_DEBUG_FORCE_BUG,      /* Hook to force a BUG */
-        CTL_CONSOLE_RATELIMIT,    /* Ratelimit console messages */
+        CTL_DEBUG_STACK_SIZE,     /* Max observed stack size */
+
+	CTL_CONSOLE_RATELIMIT,    /* Ratelimit console messages */
         CTL_CONSOLE_MAX_DELAY_CS, /* Max delay at which we skip messages */
         CTL_CONSOLE_MIN_DELAY_CS, /* Init delay at which we skip messages */
         CTL_CONSOLE_BACKOFF,      /* Delay increase factor */
-        CTL_STACK_SIZE,           /* Max observed stack size */
+
 #ifdef DEBUG_KMEM
         CTL_KMEM_KMEMUSED,        /* Crrently alloc'd kmem bytes */
         CTL_KMEM_KMEMMAX,         /* Max alloc'd by kmem bytes */
         CTL_KMEM_VMEMUSED,        /* Currently alloc'd vmem bytes */
         CTL_KMEM_VMEMMAX,         /* Max alloc'd by vmem bytes */
 #endif
-	CTL_HOSTID,               /* Host id reported by /usr/bin/hostid */
-	CTL_HW_SERIAL,            /* Hardware serial number from hostid */
+
+	CTL_MUTEX_STATS,          /* Global mutex statistics */
+	CTL_MUTEX_STATS_PER,      /* Per mutex statistics */
+	CTL_MUTEX_SPIN_MAX,       /* Maximum mutex spin iterations */
 };
 
 static int
@@ -368,21 +381,107 @@ proc_dohostid(struct ctl_table *table, int write, struct file *filp,
         RETURN(rc);
 }
 
-static struct ctl_table spl_table[] = {
-        /* NB No .strategy entries have been provided since
-         * sysctl(8) prefers to go via /proc for portability.
-         */
-        {
-                .ctl_name = CTL_VERSION,
-                .procname = "version",
-                .data     = spl_version,
-                .maxlen   = sizeof(spl_version),
-                .mode     = 0444,
-                .proc_handler = &proc_dostring,
-        },
+#ifdef DEBUG_MUTEX
+static void
+mutex_seq_show_headers(struct seq_file *f)
+{
+        seq_printf(f, "%-36s %-4s %-16s\t"
+                   "e_tot\te_nh\te_sp\te_sl\tte_tot\tte_nh\n",
+		   "name", "type", "owner");
+}
+
+/* seq_file .show: emit one row (name, type, owner, counters) for 'p'. */
+static int
+mutex_seq_show(struct seq_file *f, void *p)
+{
+        kmutex_t *mp = p;
+	char t = 'X';
+        int i;
+
+	ASSERT(mp->km_magic == KM_MAGIC);
+	switch (mp->km_type) {
+		case MUTEX_DEFAULT:	t = 'D';	break;
+		case MUTEX_SPIN:	t = 'S';	break;
+		case MUTEX_ADAPTIVE:	t = 'A';	break;
+		default:
+			SBUG();
+	}
+        seq_printf(f, "%-36s %c    ", mp->km_name, t);
+	if (mp->km_owner)
+                seq_printf(f, "%p\t", mp->km_owner);
+	else
+                seq_printf(f, "%-16s\t", "<not held>");
+
+        for (i = 0; i < MUTEX_STATS_SIZE; i++)
+                seq_printf(f, "%d%c", mp->km_stats[i],
+                           (i + 1 == MUTEX_STATS_SIZE) ? '\n' : '\t');
+
+        return 0;
+}
+
+/* seq_file .start: take mutex_stats_lock (dropped in .stop) and return the
+ * entry at index *pos, or NULL when the list holds no such entry. */
+static void *
+mutex_seq_start(struct seq_file *f, loff_t *pos)
+{
+        struct list_head *p;
+        loff_t n = *pos;
+        ENTRY;
+        mutex_lock(&mutex_stats_lock);
+        if (!n)
+                mutex_seq_show_headers(f);
+
+        /* Stop at the list head so an empty list never yields an entry */
+        p = mutex_stats_list.next;
+        while (n-- && p != &mutex_stats_list)
+                p = p->next;
+        if (p == &mutex_stats_list)
+                RETURN(NULL);
+        RETURN(list_entry(p, kmutex_t, km_list));
+}
+
+/* seq_file .next: bump *pos and return the following mutex, NULL at end. */
+static void *
+mutex_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+	kmutex_t *mp = p;
+        ENTRY;
+        ++*pos;
+        RETURN((mp->km_list.next == &mutex_stats_list) ?
+	       NULL : list_entry(mp->km_list.next, kmutex_t, km_list));
+}
+
+static void
+mutex_seq_stop(struct seq_file *f, void *v)
+{
+        mutex_unlock(&mutex_stats_lock); /* Taken in mutex_seq_start() */
+}
+
+static struct seq_operations mutex_seq_ops = {
+        .show  = mutex_seq_show,
+        .start = mutex_seq_start,
+        .next  = mutex_seq_next,
+        .stop  = mutex_seq_stop,
+};
+
+static int
+proc_mutex_open(struct inode *inode, struct file *filp)
+{
+        return seq_open(filp, &mutex_seq_ops);
+}
+
+static struct file_operations proc_mutex_operations = {
+        .open           = proc_mutex_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
+#endif /* DEBUG_MUTEX */
+
+static struct ctl_table spl_debug_table[] = {
         {
                 .ctl_name = CTL_DEBUG_SUBSYS,
-                .procname = "debug_subsystem",
+                .procname = "subsystem",
                 .data     = &spl_debug_subsys,
                 .maxlen   = sizeof(unsigned long),
                 .mode     = 0644,
@@ -390,7 +489,7 @@ static struct ctl_table spl_table[] = {
         },
         {
                 .ctl_name = CTL_DEBUG_MASK,
-                .procname = "debug_mask",
+                .procname = "mask",
                 .data     = &spl_debug_mask,
                 .maxlen   = sizeof(unsigned long),
                 .mode     = 0644,
@@ -398,7 +497,7 @@ static struct ctl_table spl_table[] = {
         },
         {
                 .ctl_name = CTL_DEBUG_PRINTK,
-                .procname = "debug_printk",
+                .procname = "printk",
                 .data     = &spl_debug_printk,
                 .maxlen   = sizeof(unsigned long),
                 .mode     = 0644,
@@ -406,13 +505,13 @@ static struct ctl_table spl_table[] = {
         },
         {
                 .ctl_name = CTL_DEBUG_MB,
-                .procname = "debug_mb",
+                .procname = "mb",
                 .mode     = 0644,
                 .proc_handler = &proc_debug_mb,
         },
         {
                 .ctl_name = CTL_DEBUG_BINARY,
-                .procname = "debug_binary",
+                .procname = "binary",
                 .data     = &spl_debug_binary,
                 .maxlen   = sizeof(int),
                 .mode     = 0644,
@@ -436,7 +535,7 @@ static struct ctl_table spl_table[] = {
         },
         {
                 .ctl_name = CTL_DEBUG_PATH,
-                .procname = "debug_path",
+                .procname = "path",
                 .data     = spl_debug_file_path,
                 .maxlen   = sizeof(spl_debug_file_path),
                 .mode     = 0644,
@@ -444,7 +543,7 @@ static struct ctl_table spl_table[] = {
         },
         {
                 .ctl_name = CTL_DEBUG_DUMP,
-                .procname = "debug_dump",
+                .procname = "dump",
                 .mode     = 0200,
                 .proc_handler = &proc_dump_kernel,
         },
@@ -483,14 +582,40 @@ static struct ctl_table spl_table[] = {
                 .proc_handler = &proc_console_backoff,
         },
         {
-                .ctl_name = CTL_STACK_SIZE,
+                .ctl_name = CTL_DEBUG_STACK_SIZE,
                 .procname = "stack_max",
                 .data     = &spl_debug_stack,
                 .maxlen   = sizeof(int),
                 .mode     = 0444,
                 .proc_handler = &proc_dointvec,
         },
+	{0},
+};
+
+#ifdef DEBUG_MUTEX
+static struct ctl_table spl_mutex_table[] = {
+        {
+                .ctl_name = CTL_MUTEX_STATS,
+                .procname = "stats",
+                .data     = &mutex_stats,
+                .maxlen   = sizeof(int) * MUTEX_STATS_SIZE,
+                .mode     = 0444,
+                .proc_handler = &proc_dointvec,
+        },
+        {
+                .ctl_name = CTL_MUTEX_SPIN_MAX,
+                .procname = "spin_max",
+                .data     = &mutex_spin_max,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_dointvec,
+        },
+	{0},
+};
+#endif /* DEBUG_MUTEX */
+
 #ifdef DEBUG_KMEM
+static struct ctl_table spl_kmem_table[] = {
         {
                 .ctl_name = CTL_KMEM_KMEMUSED,
                 .procname = "kmem_used",
@@ -527,7 +652,22 @@ static struct ctl_table spl_table[] = {
                 .mode     = 0444,
                 .proc_handler = &proc_doulongvec_minmax,
         },
-#endif
+	{0},
+};
+#endif /* DEBUG_KMEM */
+
+static struct ctl_table spl_table[] = {
+        /* NB No .strategy entries have been provided since
+         * sysctl(8) prefers to go via /proc for portability.
+         */
+        {
+                .ctl_name = CTL_VERSION,
+                .procname = "version",
+                .data     = spl_version,
+                .maxlen   = sizeof(spl_version),
+                .mode     = 0444,
+                .proc_handler = &proc_dostring,
+        },
         {
                 .ctl_name = CTL_HOSTID,
                 .procname = "hostid",
@@ -544,10 +684,32 @@ static struct ctl_table spl_table[] = {
                 .mode     = 0444,
                 .proc_handler = &proc_dostring,
         },
+	{
+		.ctl_name = CTL_SPL_DEBUG,
+		.procname = "debug",
+		.mode     = 0555,
+		.child    = spl_debug_table,
+	},
+#ifdef DEBUG_MUTEX
+	{
+		.ctl_name = CTL_SPL_MUTEX,
+		.procname = "mutex",
+		.mode     = 0555,
+		.child    = spl_mutex_table,
+	},
+#endif
+#ifdef DEBUG_KMEM
+	{
+		.ctl_name = CTL_SPL_KMEM,
+		.procname = "kmem",
+		.mode     = 0555,
+		.child    = spl_kmem_table,
+	},
+#endif
         { 0 },
 };
 
-static struct ctl_table spl_dir_table[] = {
+static struct ctl_table spl_dir[] = {
         {
                 .ctl_name = CTL_SPL,
                 .procname = "spl",
@@ -563,9 +725,22 @@ proc_init(void)
         ENTRY;
 
 #ifdef CONFIG_SYSCTL
-        spl_header = register_sysctl_table(spl_dir_table, 0);
+        spl_header = register_sysctl_table(spl_dir, 0);
 	if (spl_header == NULL)
 		RETURN(-EUNATCH);
+
+#ifdef DEBUG_MUTEX
+	{
+                struct proc_dir_entry *entry = create_proc_entry("mutex_stats",
+								 0444, NULL);
+                if (entry) {
+                        entry->proc_fops = &proc_mutex_operations;
+                } else {
+                        unregister_sysctl_table(spl_header);
+                        RETURN(-EUNATCH);
+                }
+	}
+#endif /* DEBUG_MUTEX */
 #endif
         RETURN(0);
 }
@@ -577,6 +752,7 @@ proc_fini(void)
 
 #ifdef CONFIG_SYSCTL
         ASSERT(spl_header != NULL);
+        remove_proc_entry("mutex_stats", NULL);
         unregister_sysctl_table(spl_header);
 #endif
         EXIT;
diff --git a/modules/spl/spl-taskq.c b/modules/spl/spl-taskq.c
index ad9be695b..70deb0aea 100644
--- a/modules/spl/spl-taskq.c
+++ b/modules/spl/spl-taskq.c
@@ -106,7 +106,7 @@ task_done(taskq_t *tq, task_t *t)
 		t->t_id = 0;
 		t->t_func = NULL;
 		t->t_arg = NULL;
-                list_add(&t->t_list, &tq->tq_free_list);
+                list_add_tail(&t->t_list, &tq->tq_free_list);
 	} else {
 		task_free(tq, t);
 	}
@@ -209,7 +209,7 @@ __taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 
 
 	spin_lock(&t->t_lock);
-	list_add(&t->t_list, &tq->tq_pend_list);
+	list_add_tail(&t->t_list, &tq->tq_pend_list);
 	t->t_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;
         t->t_func = func;
@@ -282,7 +282,7 @@ taskq_thread(void *args)
                 if (!list_empty(&tq->tq_pend_list)) {
                         t = list_entry(tq->tq_pend_list.next, task_t, t_list);
                         list_del_init(&t->t_list);
-			list_add(&t->t_list, &tq->tq_work_list);
+			list_add_tail(&t->t_list, &tq->tq_work_list);
                         tq->tq_nactive++;
 			spin_unlock_irq(&tq->tq_lock);