mirror_zfs/module/spl/spl-rwlock.c

/*****************************************************************************\
 *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 *  Copyright (C) 2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *  UCRL-CODE-235197
 *
 *  This file is part of the SPL, Solaris Porting Layer.
 *  For details, see <http://zfsonlinux.org/>.
 *
 *  The SPL is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2 of the License, or (at your
 *  option) any later version.
 *
 *  The SPL is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************
 *  Solaris Porting Layer (SPL) Reader/Writer Lock Implementation.
\*****************************************************************************/

#include <sys/rwlock.h>

#ifdef DEBUG_SUBSYSTEM
#undef DEBUG_SUBSYSTEM
#endif

#define DEBUG_SUBSYSTEM S_RWLOCK

#if defined(CONFIG_PREEMPT_RT_FULL)

#include <linux/rtmutex.h>
#define	RT_MUTEX_OWNER_MASKALL	1UL

/*
 * PREEMPT_RT_FULL flavour of the read-to-write upgrade attempt.
 *
 * Under the realtime patch series an rwsem is a single mutex that is
 * held by readers and writers alike.  So that one thread can still take
 * the read lock more than once, the nesting is tracked in the
 * read_depth counter; rwsem_rt.h represents a write lock as
 * read_depth == 0.
 *
 * Returns 1 if the lock is now held as a write lock, 0 if the upgrade
 * was refused.
 */
static int
__rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
	/*
	 * The owner pointer is compared with its lowest bit masked off;
	 * the calling task is expected to be that owner.
	 */
	ASSERT((struct task_struct *)
	    ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) ==
	    current);

	/*
	 * The current thread has taken the read lock more than once,
	 * so an upgrade cannot be granted.
	 */
	if (rwsem->read_depth > 1)
		return (0);

	/* Not nested: mark the lock as write-held. */
	rwsem->read_depth = 0;
	return (1);
}
#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
/*
 * Generic-spinlock rwsem flavour of the read-to-write upgrade attempt.
 *
 * With wait_lock held (interrupt state saved), the upgrade is granted
 * only if the count equals SPL_RWSEM_SINGLE_READER_VALUE and nobody is
 * queued on wait_list; the count is then replaced with
 * SPL_RWSEM_SINGLE_WRITER_VALUE.
 *
 * Returns 1 if the upgrade happened, 0 otherwise.
 */
static int
__rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
	unsigned long irq_flags;
	int upgraded;

	spl_rwsem_lock_irqsave(&rwsem->wait_lock, irq_flags);

	/* Both the check and the store happen under wait_lock. */
	upgraded = (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE &&
	    list_empty(&rwsem->wait_list));
	if (upgraded)
		RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE;

	spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, irq_flags);

	return (upgraded);
}
#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
/*
 * atomic_long_t count flavour of the read-to-write upgrade attempt.
 *
 * Tries to swap the count from SPL_RWSEM_SINGLE_READER_VALUE to
 * SPL_RWSEM_SINGLE_WRITER_VALUE with a single atomic_long_cmpxchg().
 *
 * Returns 1 if the value handed back by the compare-and-exchange was
 * the single-reader value, 0 otherwise.
 */
static int
__rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
	long prev = atomic_long_cmpxchg(&rwsem->count,
	    SPL_RWSEM_SINGLE_READER_VALUE, SPL_RWSEM_SINGLE_WRITER_VALUE);

	return (prev == SPL_RWSEM_SINGLE_READER_VALUE);
}
#else
static int
__rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
	typeof (rwsem->count) val;
	val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
	    SPL_RWSEM_SINGLE_WRITER_VALUE);
	return (val == SPL_RWSEM_SINGLE_READER_VALUE);
}
#endif

/*
 * Try to upgrade a held read lock on rwsem to a write lock.
 *
 * The configuration-specific __rwsem_tryupgrade() does the actual state
 * change.  When it succeeds, the lock's dep_map is released and then
 * re-acquired on behalf of the caller (_RET_IP_), and on kernels built
 * with CONFIG_RWSEM_SPIN_ON_OWNER the current task is recorded as the
 * owner.
 *
 * Returns 1 on a successful upgrade, 0 if the upgrade was not possible.
 */
int
rwsem_tryupgrade(struct rw_semaphore *rwsem)
{
	if (!__rwsem_tryupgrade(rwsem))
		return (0);

	rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
	rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
	rwsem->owner = current;
#endif

	return (1);
}
EXPORT_SYMBOL(rwsem_tryupgrade);

/*
 * Subsystem init hook for the rwlock code.  Nothing needs to be set up,
 * so it always reports success (0).
 */
int
spl_rw_init(void)
{
	return (0);
}
/*
 * Subsystem teardown hook for the rwlock code.  There is nothing to
 * release, so this is intentionally a no-op.
 */
void
spl_rw_fini(void)
{
}
Public Release Prep Updated AUTHORS, COPYING, DISCLAIMER, and INSTALL files. Added standardized headers to all source file to clearly indicate the copyright, license, and to give credit where credit is due. 2010-05-18 02:18:00 +04:00			`/*****************************************************************************\`
			`* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.`
			`* Copyright (C) 2007 The Regents of the University of California.`
			`* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).`
			`* Written by Brian Behlendorf <behlendorf1@llnl.gov>.`
Go through and add a header with the proper UCRL number. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@114 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c 2008-05-26 08:38:26 +04:00			`* UCRL-CODE-235197`
			`*`
Public Release Prep Updated AUTHORS, COPYING, DISCLAIMER, and INSTALL files. Added standardized headers to all source file to clearly indicate the copyright, license, and to give credit where credit is due. 2010-05-18 02:18:00 +04:00			`* This file is part of the SPL, Solaris Porting Layer.`
Refresh links to web site Update links to refer to the official ZFS on Linux website instead of @behlendorf's personal fork on github. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> 2013-03-05 05:26:55 +04:00			`* For details, see <http://zfsonlinux.org/>.`
Go through and add a header with the proper UCRL number. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@114 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c 2008-05-26 08:38:26 +04:00			`*`
Public Release Prep Updated AUTHORS, COPYING, DISCLAIMER, and INSTALL files. Added standardized headers to all source file to clearly indicate the copyright, license, and to give credit where credit is due. 2010-05-18 02:18:00 +04:00			`* The SPL is free software; you can redistribute it and/or modify it`
			`* under the terms of the GNU General Public License as published by the`
			`* Free Software Foundation; either version 2 of the License, or (at your`
			`* option) any later version.`
			`*`
			`* The SPL is distributed in the hope that it will be useful, but WITHOUT`
Go through and add a header with the proper UCRL number. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@114 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c 2008-05-26 08:38:26 +04:00			`* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or`
			`* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
			`* for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License along`
Public Release Prep Updated AUTHORS, COPYING, DISCLAIMER, and INSTALL files. Added standardized headers to all source file to clearly indicate the copyright, license, and to give credit where credit is due. 2010-05-18 02:18:00 +04:00			`* with the SPL. If not, see <http://www.gnu.org/licenses/>.`
			`*****************************************************************************`
			`* Solaris Porting Layer (SPL) Reader/Writer Lock Implementation.`
			`\*****************************************************************************/`
Go through and add a header with the proper UCRL number. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@114 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c 2008-05-26 08:38:26 +04:00
Reorganize /include/ to add a /sys/, this way we don't need to muck with #includes in existing Solaris style source to get it to find the right stuff. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@18 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c 2008-03-01 03:45:59 +03:00			`#include <sys/rwlock.h>`
Initial commit. All spl source written up to this point wrapped in an initial reasonable autoconf style build system. This does not yet build but the configure system does appear to work properly and integrate with the kernel. Hopefully the next commit gets us back to a buildable version we can run the test suite against. git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@1 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c 2008-02-26 23:36:04 +03:00
Update SPL to use new debug infrastructure. This means: - Replacing all BUG_ON()'s with proper ASSERT()'s - Using ENTRY,EXIT,GOTO, and RETURN macros to instrument call paths git-svn-id: https://outreach.scidac.gov/svn/spl/trunk@78 7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c 2008-04-21 21:29:47 +04:00			`#ifdef DEBUG_SUBSYSTEM`
			`#undef DEBUG_SUBSYSTEM`
			`#endif`

			`#define DEBUG_SUBSYSTEM S_RWLOCK`

Add support for rw semaphore under PREEMPT_RT_FULL The main complication from the RT patch set is that the RW semaphore locks change such that read locks on an rwsem can be taken only by a single thread. All other threads are locked out. This single thread can take a read lock multiple times though. The underlying implementation changes to a mutex with an additional read_depth count. The implementation can be best understood by inspecting the RT patch. rwsem_rt.h and rt.c give the best insight into how RT rwsem works. My implementation for rwsem_tryupgrade is basically an inversion of rt_downgrade_write found in rt.c. Please see the comments in the code. Unfortunately, I have to drop SPLAT rwlock test4 completely as this test tries to take multiple locks from different threads, which RT rwsems do not support. Otherwise SPLAT, zconfig.sh, zpios-sanity.sh and zfs-tests.sh pass on my Debian-testing VM with the kernel linux-image-4.8.0-1-rt-amd64. Tested-by: kernelOfTruth <kerneloftruth@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Clemens Fruhwirth <clemens@endorphin.org> Closes zfsonlinux/zfs#5491 Closes #589 Closes #308 2016-12-17 19:09:57 +03:00			`#if defined(CONFIG_PREEMPT_RT_FULL)`

			`#include <linux/rtmutex.h>`
Reimplement rt_mutex_owner to fix build with DEBUG & PREEMPT_RT_FULL rt_mutex_owner is internal to kernel/locking/rtmutex_common.h and inaccessible for SPL via the public kernel headers. The way of accessing the owner has been stable since at least 3.13 ([1], [2]), which is masking the lowest bit in the owner pointer in rt_mutex. We do the same. [1] http://lxr.free-electrons.com/source/kernel/locking/rtmutex_common.h?v=3.13#L99 [2] http://lxr.free-electrons.com/source/kernel/locking/rtmutex_common.h?v=4.9#L78 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Clemens Fruhwirth <clemens@endorphin.org> Closes #593 2017-01-20 01:41:38 +03:00			`#define RT_MUTEX_OWNER_MASKALL 1UL`
Add support for rw semaphore under PREEMPT_RT_FULL The main complication from the RT patch set is that the RW semaphore locks change such that read locks on an rwsem can be taken only by a single thread. All other threads are locked out. This single thread can take a read lock multiple times though. The underlying implementation changes to a mutex with an additional read_depth count. The implementation can be best understood by inspecting the RT patch. rwsem_rt.h and rt.c give the best insight into how RT rwsem works. My implementation for rwsem_tryupgrade is basically an inversion of rt_downgrade_write found in rt.c. Please see the comments in the code. Unfortunately, I have to drop SPLAT rwlock test4 completely as this test tries to take multiple locks from different threads, which RT rwsems do not support. Otherwise SPLAT, zconfig.sh, zpios-sanity.sh and zfs-tests.sh pass on my Debian-testing VM with the kernel linux-image-4.8.0-1-rt-amd64. Tested-by: kernelOfTruth <kerneloftruth@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Clemens Fruhwirth <clemens@endorphin.org> Closes zfsonlinux/zfs#5491 Closes #589 Closes #308 2016-12-17 19:09:57 +03:00
			`static int`
			`__rwsem_tryupgrade(struct rw_semaphore *rwsem)`
			`{`
Reimplement rt_mutex_owner to fix build with DEBUG & PREEMPT_RT_FULL rt_mutex_owner is internal to kernel/locking/rtmutex_common.h and inaccessible for SPL via the public kernel headers. The way of accessing the owner has been stable since at least 3.13 ([1], [2]), which is masking the lowest bit in the owner pointer in rt_mutex. We do the same. [1] http://lxr.free-electrons.com/source/kernel/locking/rtmutex_common.h?v=3.13#L99 [2] http://lxr.free-electrons.com/source/kernel/locking/rtmutex_common.h?v=4.9#L78 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Clemens Fruhwirth <clemens@endorphin.org> Closes #593 2017-01-20 01:41:38 +03:00
			`ASSERT((struct task_struct *)`
			`((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) ==`
			`current);`
Add support for rw semaphore under PREEMPT_RT_FULL The main complication from the RT patch set is that the RW semaphore locks change such that read locks on an rwsem can be taken only by a single thread. All other threads are locked out. This single thread can take a read lock multiple times though. The underlying implementation changes to a mutex with an additional read_depth count. The implementation can be best understood by inspecting the RT patch. rwsem_rt.h and rt.c give the best insight into how RT rwsem works. My implementation for rwsem_tryupgrade is basically an inversion of rt_downgrade_write found in rt.c. Please see the comments in the code. Unfortunately, I have to drop SPLAT rwlock test4 completely as this test tries to take multiple locks from different threads, which RT rwsems do not support. Otherwise SPLAT, zconfig.sh, zpios-sanity.sh and zfs-tests.sh pass on my Debian-testing VM with the kernel linux-image-4.8.0-1-rt-amd64. Tested-by: kernelOfTruth <kerneloftruth@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Clemens Fruhwirth <clemens@endorphin.org> Closes zfsonlinux/zfs#5491 Closes #589 Closes #308 2016-12-17 19:09:57 +03:00
			`/*`
			`* Under the realtime patch series, rwsem is implemented as a`
			`* single mutex held by readers and writers alike. However,`
			`* this implementation would prevent a thread from taking a`
			`* read lock twice, as the mutex would already be locked on`
			`* the second attempt. Therefore the implementation allows a`
			`* single thread to take a rwsem as read lock multiple times`
			`* tracking that nesting as read_depth counter.`
			`*/`
			`if (rwsem->read_depth <= 1) {`
			`/*`
			`* In case, the current thread has not taken the lock`
			`* more than once as read lock, we can allow an`
			`* upgrade to a write lock. rwsem_rt.h implements`
			`* write locks as read_depth == 0.`
			`*/`
			`rwsem->read_depth = 0;`
			`return (1);`
			`}`
			`return (0);`
			`}`
			`#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)`
Implement a proper rw_tryupgrade Current rw_tryupgrade does rw_exit and then rw_tryenter(RW_WRITER), and then does rw_enter(RW_READER) if it fails. This violates the assumption that rw_tryupgrade should be atomic and could cause extra contention or even lock inversion. In this patch we implement a proper rw_tryupgrade. For rwsem-spinlock, we take the spinlock to check rwsem->count and rwsem->wait_list. For normal rwsem, we use cmpxchg on rwsem->count to change the value from single reader to single writer. Signed-off-by: Chunwei Chen <david.chen@osnexus.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tim Chase <tim@chase2k.com> Closes zfsonlinux/zfs#4692 Closes #554 2016-05-26 02:35:42 +03:00			`static int`
			`__rwsem_tryupgrade(struct rw_semaphore *rwsem)`
			`{`
			`int ret = 0;`
			`unsigned long flags;`
			`spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags);`
			`if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE &&`
			`list_empty(&rwsem->wait_list)) {`
			`ret = 1;`
			`RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE;`
			`}`
			`spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags);`
			`return (ret);`
			`}`
Linux 4.8 compat: rw_semaphore atomic_long_t count For non-rwsem-spinlocks the "count" member was changed from a "long" to "atomic_long_t" type. A configure check has been added to detect this change along with new versions of the _rwsem_tryupgrade() function and RWSEM_COUNT() macro. See https://github.com/torvalds/linux/commit/8ee62b18 for complete details. Signed-off-by: Tim Chase <tim@chase2k.com> Signed-off-by: Chunwei Chen <david.chen@osnexus.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #563 2016-07-27 02:37:46 +03:00			`#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)`
			`static int`
			`__rwsem_tryupgrade(struct rw_semaphore *rwsem)`
			`{`
			`long val;`
			`val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,`
			`SPL_RWSEM_SINGLE_WRITER_VALUE);`
			`return (val == SPL_RWSEM_SINGLE_READER_VALUE);`
			`}`
Implement a proper rw_tryupgrade Current rw_tryupgrade does rw_exit and then rw_tryenter(RW_WRITER), and then does rw_enter(RW_READER) if it fails. This violates the assumption that rw_tryupgrade should be atomic and could cause extra contention or even lock inversion. In this patch we implement a proper rw_tryupgrade. For rwsem-spinlock, we take the spinlock to check rwsem->count and rwsem->wait_list. For normal rwsem, we use cmpxchg on rwsem->count to change the value from single reader to single writer. Signed-off-by: Chunwei Chen <david.chen@osnexus.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tim Chase <tim@chase2k.com> Closes zfsonlinux/zfs#4692 Closes #554 2016-05-26 02:35:42 +03:00			`#else`
			`static int`
			`__rwsem_tryupgrade(struct rw_semaphore *rwsem)`
			`{`
			`typeof (rwsem->count) val;`
			`val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,`
			`SPL_RWSEM_SINGLE_WRITER_VALUE);`
			`return (val == SPL_RWSEM_SINGLE_READER_VALUE);`
			`}`
			`#endif`

			`int`
			`rwsem_tryupgrade(struct rw_semaphore *rwsem)`
			`{`
			`if (__rwsem_tryupgrade(rwsem)) {`
			`rwsem_release(&rwsem->dep_map, 1, _RET_IP_);`
			`rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);`
			`#ifdef CONFIG_RWSEM_SPIN_ON_OWNER`
			`rwsem->owner = current;`
			`#endif`
			`return (1);`
			`}`
			`return (0);`
			`}`
			`EXPORT_SYMBOL(rwsem_tryupgrade);`

Update rwlocks to track owner to ensure correct semantics The behavior of RW_*_HELD was updated because it was not quite right. It is not sufficient to return non-zero when the lock is held, we must only do this when the current task is the holder. This means we need to track the lock owner which is not something tracked in a Linux semaphore. After some experimentation the solution I settled on was to embed the Linux semaphore at the start of a larger krwlock_t structure which includes the owner field. This maintains good performance and allows us to cleanly integrate with the kernel lock analysis tools. My reasons: 1) By placing the Linux semaphore at the start of krwlock_t we can then simply cast krwlock_t to a rw_semaphore and pass that on to the Linux kernel. This allows us to use #defines so the preprocessor can do direct replacement of the Solaris primitive with the Linux equivalent. This is important because it then maintains the location information for each rw_ call point. 2) Additionally, by adding the owner to krwlock_t we can keep this needed extra information adjacent to the lock itself. This removes the need for a fancy lookup to get the owner which is optimal for performance. We can also leverage the existing spin lock in the semaphore to ensure owner is updated correctly. 3) All helper functions which do not need to strictly be implemented as a define to preserve location information can be done as a static inline function. 4) Adding the owner to krwlock_t allows us to remove all memory allocations done during lock initialization. This is good for all the obvious reasons, we do give up the ability to specify the lock name. The Linux profiling tools will stringify the lock name used in the code via the preprocessor and use that. Update rwlocks validated on: - SLES10 (ppc64) - SLES11 (x86_64) - CHAOS4.2 (x86_64) - RHEL5.3 (x86_64) - RHEL6 (x86_64) - FC11 (x86_64) 2009-09-26 01:14:35 +04:00			`int spl_rw_init(void) { return 0; }`
			`void spl_rw_fini(void) { }`