Implement a proper rw_tryupgrade

Current rw_tryupgrade does rw_exit and then rw_tryenter(RW_RWITER), and then
does rw_enter(RW_READER) if it fails. This violate the assumption that
rw_tryupgrade should be atomic and could cause extra contention or even lock
inversion.

This patch we implement a proper rw_tryupgrade. For rwsem-spinlock, we take
the spinlock to check rwsem->count and rwsem->wait_list. For normal rwsem, we
use cmpxchg on rwsem->count to change the value from single reader to single
writer.

Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tim Chase <tim@chase2k.com>
Closes zfsonlinux/zfs#4692
Closes #554
This commit is contained in:
Chunwei Chen 2016-05-25 16:35:42 -07:00 committed by Brian Behlendorf
parent c60a51b640
commit f58040c0fc
5 changed files with 147 additions and 13 deletions

View File

@ -39,6 +39,7 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [
SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
SPL_AC_SHRINK_CONTROL_STRUCT SPL_AC_SHRINK_CONTROL_STRUCT
SPL_AC_RWSEM_SPINLOCK_IS_RAW SPL_AC_RWSEM_SPINLOCK_IS_RAW
SPL_AC_RWSEM_ACTIVITY
SPL_AC_SCHED_RT_HEADER SPL_AC_SCHED_RT_HEADER
SPL_AC_2ARGS_VFS_GETATTR SPL_AC_2ARGS_VFS_GETATTR
SPL_AC_USLEEP_RANGE SPL_AC_USLEEP_RANGE
@ -1316,6 +1317,30 @@ AC_DEFUN([SPL_AC_RWSEM_SPINLOCK_IS_RAW], [
EXTRA_KCFLAGS="$tmp_flags" EXTRA_KCFLAGS="$tmp_flags"
]) ])
dnl #
dnl # 3.16 API Change
dnl #
dnl # rwsem-spinlock "->activity" changed to "->count"
dnl #
AC_DEFUN([SPL_AC_RWSEM_ACTIVITY], [
AC_MSG_CHECKING([whether struct rw_semaphore has member activity])
tmp_flags="$EXTRA_KCFLAGS"
EXTRA_KCFLAGS="-Werror"
SPL_LINUX_TRY_COMPILE([
#include <linux/rwsem.h>
],[
struct rw_semaphore dummy_semaphore __attribute__ ((unused));
dummy_semaphore.activity = 0;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1,
[struct rw_semaphore has member activity])
],[
AC_MSG_RESULT(no)
])
EXTRA_KCFLAGS="$tmp_flags"
])
dnl # dnl #
dnl # 3.9 API change, dnl # 3.9 API change,
dnl # Moved things from linux/sched.h to linux/sched/rt.h dnl # Moved things from linux/sched.h to linux/sched/rt.h

View File

@ -27,6 +27,23 @@
#include <linux/rwsem.h> #include <linux/rwsem.h>
#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
#define SPL_RWSEM_SINGLE_READER_VALUE (1)
#define SPL_RWSEM_SINGLE_WRITER_VALUE (-1)
#else
#define SPL_RWSEM_SINGLE_READER_VALUE (RWSEM_ACTIVE_READ_BIAS)
#define SPL_RWSEM_SINGLE_WRITER_VALUE (RWSEM_ACTIVE_WRITE_BIAS)
#endif
/* Linux 3.16 change activity to count for rwsem-spinlock */
#ifdef HAVE_RWSEM_ACTIVITY
#define RWSEM_COUNT(sem) sem->activity
#else
#define RWSEM_COUNT(sem) sem->count
#endif
int rwsem_tryupgrade(struct rw_semaphore *rwsem);
#if defined(RWSEM_SPINLOCK_IS_RAW) #if defined(RWSEM_SPINLOCK_IS_RAW)
#define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl) #define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl)
#define spl_rwsem_unlock_irqrestore(lk, fl) raw_spin_unlock_irqrestore(lk, fl) #define spl_rwsem_unlock_irqrestore(lk, fl) raw_spin_unlock_irqrestore(lk, fl)

View File

@ -223,13 +223,10 @@ RW_LOCK_HELD(krwlock_t *rwp)
if (RW_WRITE_HELD(rwp)) { \ if (RW_WRITE_HELD(rwp)) { \
_rc_ = 1; \ _rc_ = 1; \
} else { \ } else { \
rw_exit(rwp); \ spl_rw_lockdep_off_maybe(rwp); \
if (rw_tryenter(rwp, RW_WRITER)) { \ if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \
_rc_ = 1; \ spl_rw_set_owner(rwp); \
} else { \ spl_rw_lockdep_on_maybe(rwp); \
rw_enter(rwp, RW_READER); \
_rc_ = 0; \
} \
} \ } \
_rc_; \ _rc_; \
}) })

View File

@ -32,5 +32,46 @@
#define DEBUG_SUBSYSTEM S_RWLOCK #define DEBUG_SUBSYSTEM S_RWLOCK
#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
static int
__rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
int ret = 0;
unsigned long flags;
spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags);
if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE &&
list_empty(&rwsem->wait_list)) {
ret = 1;
RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE;
}
spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags);
return (ret);
}
#else
static int
__rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
typeof (rwsem->count) val;
val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
SPL_RWSEM_SINGLE_WRITER_VALUE);
return (val == SPL_RWSEM_SINGLE_READER_VALUE);
}
#endif
int
rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
if (__rwsem_tryupgrade(rwsem)) {
rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
rwsem->owner = current;
#endif
return (1);
}
return (0);
}
EXPORT_SYMBOL(rwsem_tryupgrade);
int spl_rw_init(void) { return 0; } int spl_rw_init(void) { return 0; }
void spl_rw_fini(void) { } void spl_rw_fini(void) { }

View File

@ -55,8 +55,12 @@
#define SPLAT_RWLOCK_TEST5_DESC "Write downgrade" #define SPLAT_RWLOCK_TEST5_DESC "Write downgrade"
#define SPLAT_RWLOCK_TEST6_ID 0x0706 #define SPLAT_RWLOCK_TEST6_ID 0x0706
#define SPLAT_RWLOCK_TEST6_NAME "rw_tryupgrade" #define SPLAT_RWLOCK_TEST6_NAME "rw_tryupgrade-1"
#define SPLAT_RWLOCK_TEST6_DESC "Read upgrade" #define SPLAT_RWLOCK_TEST6_DESC "rwsem->count value"
#define SPLAT_RWLOCK_TEST7_ID 0x0707
#define SPLAT_RWLOCK_TEST7_NAME "rw_tryupgrade-2"
#define SPLAT_RWLOCK_TEST7_DESC "Read upgrade"
#define SPLAT_RWLOCK_TEST_MAGIC 0x115599DDUL #define SPLAT_RWLOCK_TEST_MAGIC 0x115599DDUL
#define SPLAT_RWLOCK_TEST_NAME "rwlock_test" #define SPLAT_RWLOCK_TEST_NAME "rwlock_test"
@ -580,8 +584,55 @@ splat_rwlock_test6(struct file *file, void *arg)
splat_init_rw_priv(rwp, file); splat_init_rw_priv(rwp, file);
rw_enter(&rwp->rw_rwlock, RW_READER); rw_enter(&rwp->rw_rwlock, RW_READER);
if (!RW_READ_HELD(&rwp->rw_rwlock)) { if (RWSEM_COUNT(SEM(&rwp->rw_rwlock)) !=
SPL_RWSEM_SINGLE_READER_VALUE) {
splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME,
"We assumed single reader rwsem->count "
"should be %ld, but is %ld\n",
SPL_RWSEM_SINGLE_READER_VALUE,
RWSEM_COUNT(SEM(&rwp->rw_rwlock)));
rc = -ENOLCK;
goto out;
}
rw_exit(&rwp->rw_rwlock);
rw_enter(&rwp->rw_rwlock, RW_WRITER);
if (RWSEM_COUNT(SEM(&rwp->rw_rwlock)) !=
SPL_RWSEM_SINGLE_WRITER_VALUE) {
splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME,
"We assumed single writer rwsem->count "
"should be %ld, but is %ld\n",
SPL_RWSEM_SINGLE_WRITER_VALUE,
RWSEM_COUNT(SEM(&rwp->rw_rwlock)));
rc = -ENOLCK;
goto out;
}
rc = 0;
splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s",
"rwsem->count same as we assumed\n");
out:
rw_exit(&rwp->rw_rwlock);
rw_destroy(&rwp->rw_rwlock);
kfree(rwp);
return rc;
}
static int
splat_rwlock_test7(struct file *file, void *arg)
{
rw_priv_t *rwp;
int rc;
rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL);
if (rwp == NULL)
return -ENOMEM;
splat_init_rw_priv(rwp, file);
rw_enter(&rwp->rw_rwlock, RW_READER);
if (!RW_READ_HELD(&rwp->rw_rwlock)) {
splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME,
"rwlock should be read lock: %d\n", "rwlock should be read lock: %d\n",
RW_READ_HELD(&rwp->rw_rwlock)); RW_READ_HELD(&rwp->rw_rwlock));
rc = -ENOLCK; rc = -ENOLCK;
@ -591,7 +642,7 @@ splat_rwlock_test6(struct file *file, void *arg)
/* With one reader upgrade should never fail. */ /* With one reader upgrade should never fail. */
rc = rw_tryupgrade(&rwp->rw_rwlock); rc = rw_tryupgrade(&rwp->rw_rwlock);
if (!rc) { if (!rc) {
splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME,
"rwlock failed upgrade from reader: %d\n", "rwlock failed upgrade from reader: %d\n",
RW_READ_HELD(&rwp->rw_rwlock)); RW_READ_HELD(&rwp->rw_rwlock));
rc = -ENOLCK; rc = -ENOLCK;
@ -599,7 +650,7 @@ splat_rwlock_test6(struct file *file, void *arg)
} }
if (RW_READ_HELD(&rwp->rw_rwlock) || !RW_WRITE_HELD(&rwp->rw_rwlock)) { if (RW_READ_HELD(&rwp->rw_rwlock) || !RW_WRITE_HELD(&rwp->rw_rwlock)) {
splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "rwlock should " splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "rwlock should "
"have 0 (not %d) reader and 1 (not %d) writer\n", "have 0 (not %d) reader and 1 (not %d) writer\n",
RW_READ_HELD(&rwp->rw_rwlock), RW_READ_HELD(&rwp->rw_rwlock),
RW_WRITE_HELD(&rwp->rw_rwlock)); RW_WRITE_HELD(&rwp->rw_rwlock));
@ -607,7 +658,7 @@ splat_rwlock_test6(struct file *file, void *arg)
} }
rc = 0; rc = 0;
splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "%s",
"rwlock properly upgraded\n"); "rwlock properly upgraded\n");
out: out:
rw_exit(&rwp->rw_rwlock); rw_exit(&rwp->rw_rwlock);
@ -646,6 +697,8 @@ splat_rwlock_init(void)
SPLAT_RWLOCK_TEST5_ID, splat_rwlock_test5); SPLAT_RWLOCK_TEST5_ID, splat_rwlock_test5);
SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST6_NAME, SPLAT_RWLOCK_TEST6_DESC, SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST6_NAME, SPLAT_RWLOCK_TEST6_DESC,
SPLAT_RWLOCK_TEST6_ID, splat_rwlock_test6); SPLAT_RWLOCK_TEST6_ID, splat_rwlock_test6);
SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST7_NAME, SPLAT_RWLOCK_TEST7_DESC,
SPLAT_RWLOCK_TEST7_ID, splat_rwlock_test7);
return sub; return sub;
} }
@ -654,6 +707,7 @@ void
splat_rwlock_fini(splat_subsystem_t *sub) splat_rwlock_fini(splat_subsystem_t *sub)
{ {
ASSERT(sub); ASSERT(sub);
SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST7_ID);
SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST6_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST6_ID);
SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST5_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST5_ID);
SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST4_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST4_ID);