/*
 *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
 *  Copyright (C) 2007 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 *  UCRL-CODE-235197
 *
 *  This file is part of the SPL, Solaris Porting Layer.
 *
 *  The SPL is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2 of the License, or (at your
 *  option) any later version.
 *
 *  The SPL is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  Solaris Porting Layer (SPL) Condition Variables Implementation.
 */

#include <sys/condvar.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <linux/hrtimer.h>
#include <linux/compiler_compat.h>
#include <linux/mod_compat.h>

#include <linux/sched.h>
#include <linux/sched/signal.h>

#define	MAX_HRTIMEOUT_SLACK_US	1000
static unsigned int spl_schedule_hrtimeout_slack_us = 0;

static int
param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
{
	unsigned long val;
	int error;

	error = kstrtoul(buf, 0, &val);
	if (error)
		return (error);

	if (val > MAX_HRTIMEOUT_SLACK_US)
		return (-EINVAL);

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (error);

	return (0);
}

module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
	param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
	"schedule_hrtimeout_range() delta/slack value in us, default(0)");

void
__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
{
	ASSERT(cvp);
	ASSERT(name == NULL);
	ASSERT(type == CV_DEFAULT);
	ASSERT(arg == NULL);

	cvp->cv_magic = CV_MAGIC;
	init_waitqueue_head(&cvp->cv_event);
	init_waitqueue_head(&cvp->cv_destroy);
	atomic_set(&cvp->cv_waiters, 0);
	atomic_set(&cvp->cv_refs, 1);
	cvp->cv_mutex = NULL;
}
EXPORT_SYMBOL(__cv_init);

static int
cv_destroy_wakeup(kcondvar_t *cvp)
{
	if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
		ASSERT(cvp->cv_mutex == NULL);
		ASSERT(!waitqueue_active(&cvp->cv_event));
		return (1);
	}

	return (0);
}

void
__cv_destroy(kcondvar_t *cvp)
{
	ASSERT(cvp);
	ASSERT(cvp->cv_magic == CV_MAGIC);

	cvp->cv_magic = CV_DESTROY;
	atomic_dec(&cvp->cv_refs);

	/* Block until all waiters are woken and references dropped. */
	while (cv_destroy_wakeup(cvp) == 0)
		wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);

	ASSERT3P(cvp->cv_mutex, ==, NULL);
	ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
	ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
	ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
}
EXPORT_SYMBOL(__cv_destroy);

static void
cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
{
	DEFINE_WAIT(wait);
	kmutex_t *m;

	ASSERT(cvp);
	ASSERT(mp);
	ASSERT(cvp->cv_magic == CV_MAGIC);
	ASSERT(mutex_owned(mp));
	atomic_inc(&cvp->cv_refs);

	m = READ_ONCE(cvp->cv_mutex);
	if (!m)
		m = xchg(&cvp->cv_mutex, mp);
	/* Ensure the same mutex is used by all callers */
	ASSERT(m == NULL || m == mp);

	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
	atomic_inc(&cvp->cv_waiters);

	/*
	 * Mutex should be dropped after prepare_to_wait() this
	 * ensures we're linked in to the waiters list and avoids the
	 * race where 'cvp->cv_waiters > 0' but the list is empty.
	 */
	mutex_exit(mp);
	if (io)
		io_schedule();
	else
		schedule();

	/* No more waiters a different mutex could be used */
	if (atomic_dec_and_test(&cvp->cv_waiters)) {
		/*
		 * This is set without any lock, so it's racy. But this is
		 * just for debug anyway, so make it best-effort
		 */
		cvp->cv_mutex = NULL;
		wake_up(&cvp->cv_destroy);
	}

	finish_wait(&cvp->cv_event, &wait);
	atomic_dec(&cvp->cv_refs);

	/*
	 * Hold mutex after we release the cvp, otherwise we could dead lock
	 * with a thread holding the mutex and call cv_destroy.
	 */
	mutex_enter(mp);
}

void
__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
{
	cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
}
EXPORT_SYMBOL(__cv_wait);

void
__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
{
	cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
}
EXPORT_SYMBOL(__cv_wait_io);

int
__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp)
{
	cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1);

	return (signal_pending(current) ? 0 : 1);
}
EXPORT_SYMBOL(__cv_wait_io_sig);

int
__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
{
	cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);

	return (signal_pending(current) ? 0 : 1);
}
EXPORT_SYMBOL(__cv_wait_sig);

void
__cv_wait_idle(kcondvar_t *cvp, kmutex_t *mp)
{
	sigset_t blocked, saved;

	sigfillset(&blocked);
	(void) sigprocmask(SIG_BLOCK, &blocked, &saved);
	cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
	(void) sigprocmask(SIG_SETMASK, &saved, NULL);
}
EXPORT_SYMBOL(__cv_wait_idle);

/*
 * 'expire_time' argument is an absolute wall clock time in jiffies.
 * Return value is time left (expire_time - now) or -1 if timeout occurred.
 */
static clock_t
__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
    int state, int io)
{
	DEFINE_WAIT(wait);
	kmutex_t *m;
	clock_t time_left;

	ASSERT(cvp);
	ASSERT(mp);
	ASSERT(cvp->cv_magic == CV_MAGIC);
	ASSERT(mutex_owned(mp));

	/* XXX - Does not handle jiffie wrap properly */
	time_left = expire_time - jiffies;
	if (time_left <= 0)
		return (-1);

	atomic_inc(&cvp->cv_refs);
	m = READ_ONCE(cvp->cv_mutex);
	if (!m)
		m = xchg(&cvp->cv_mutex, mp);
	/* Ensure the same mutex is used by all callers */
	ASSERT(m == NULL || m == mp);

	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
	atomic_inc(&cvp->cv_waiters);

	/*
	 * Mutex should be dropped after prepare_to_wait() this
	 * ensures we're linked in to the waiters list and avoids the
	 * race where 'cvp->cv_waiters > 0' but the list is empty.
	 */
	mutex_exit(mp);
	if (io)
		time_left = io_schedule_timeout(time_left);
	else
		time_left = schedule_timeout(time_left);

	/* No more waiters a different mutex could be used */
	if (atomic_dec_and_test(&cvp->cv_waiters)) {
		/*
		 * This is set without any lock, so it's racy. But this is
		 * just for debug anyway, so make it best-effort
		 */
		cvp->cv_mutex = NULL;
		wake_up(&cvp->cv_destroy);
	}

	finish_wait(&cvp->cv_event, &wait);
	atomic_dec(&cvp->cv_refs);

	/*
	 * Hold mutex after we release the cvp, otherwise we could dead lock
	 * with a thread holding the mutex and call cv_destroy.
	 */
	mutex_enter(mp);
	return (time_left > 0 ? 1 : -1);
}

int
__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
{
	return (__cv_timedwait_common(cvp, mp, exp_time,
	    TASK_UNINTERRUPTIBLE, 0));
}
EXPORT_SYMBOL(__cv_timedwait);

int
__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
{
	return (__cv_timedwait_common(cvp, mp, exp_time,
	    TASK_UNINTERRUPTIBLE, 1));
}
EXPORT_SYMBOL(__cv_timedwait_io);

int
__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
{
	int rc;

	rc = __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE, 0);
	return (signal_pending(current) ? 0 : rc);
}
EXPORT_SYMBOL(__cv_timedwait_sig);

int
__cv_timedwait_idle(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
{
	sigset_t blocked, saved;
	int rc;

	sigfillset(&blocked);
	(void) sigprocmask(SIG_BLOCK, &blocked, &saved);
	rc = __cv_timedwait_common(cvp, mp, exp_time,
	    TASK_INTERRUPTIBLE, 0);
	(void) sigprocmask(SIG_SETMASK, &saved, NULL);

	return (rc);
}
EXPORT_SYMBOL(__cv_timedwait_idle);
/*
 * 'expire_time' argument is an absolute clock time in nanoseconds.
 * Return value is time left (expire_time - now) or -1 if timeout occurred.
 */
static clock_t
__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
    hrtime_t res, int state)
{
	DEFINE_WAIT(wait);
	kmutex_t *m;
	hrtime_t time_left;
	ktime_t ktime_left;
	u64 slack = 0;
	int rc;

	ASSERT(cvp);
	ASSERT(mp);
	ASSERT(cvp->cv_magic == CV_MAGIC);
	ASSERT(mutex_owned(mp));

	time_left = expire_time - gethrtime();
	if (time_left <= 0)
		return (-1);

	atomic_inc(&cvp->cv_refs);
	m = READ_ONCE(cvp->cv_mutex);
	if (!m)
		m = xchg(&cvp->cv_mutex, mp);
	/* Ensure the same mutex is used by all callers */
	ASSERT(m == NULL || m == mp);

	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
	atomic_inc(&cvp->cv_waiters);

	/*
	 * Mutex should be dropped after prepare_to_wait() this
	 * ensures we're linked in to the waiters list and avoids the
	 * race where 'cvp->cv_waiters > 0' but the list is empty.
	 */
	mutex_exit(mp);

	ktime_left = ktime_set(0, time_left);
	slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC),
	    MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC);
	rc = schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL);

	/* No more waiters a different mutex could be used */
	if (atomic_dec_and_test(&cvp->cv_waiters)) {
		/*
		 * This is set without any lock, so it's racy. But this is
		 * just for debug anyway, so make it best-effort
		 */
		cvp->cv_mutex = NULL;
		wake_up(&cvp->cv_destroy);
	}

	finish_wait(&cvp->cv_event, &wait);
	atomic_dec(&cvp->cv_refs);

	mutex_enter(mp);
	return (rc == -EINTR ? 1 : -1);
}

/*
 * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
 */
static int
cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
    hrtime_t res, int flag, int state)
{
	if (!(flag & CALLOUT_FLAG_ABSOLUTE))
		tim += gethrtime();

	return (__cv_timedwait_hires(cvp, mp, tim, res, state));
}

int
cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
    int flag)
{
	return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
	    TASK_UNINTERRUPTIBLE));
}
EXPORT_SYMBOL(cv_timedwait_hires);

int
cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
    hrtime_t res, int flag)
{
	int rc;

	rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag,
	    TASK_INTERRUPTIBLE);
	return (signal_pending(current) ? 0 : rc);
}
EXPORT_SYMBOL(cv_timedwait_sig_hires);

int
cv_timedwait_idle_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
    hrtime_t res, int flag)
{
	sigset_t blocked, saved;
	int rc;

	sigfillset(&blocked);
	(void) sigprocmask(SIG_BLOCK, &blocked, &saved);
	rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag,
	    TASK_INTERRUPTIBLE);
	(void) sigprocmask(SIG_SETMASK, &saved, NULL);

	return (rc);
}
EXPORT_SYMBOL(cv_timedwait_idle_hires);

void
__cv_signal(kcondvar_t *cvp)
{
	ASSERT(cvp);
	ASSERT(cvp->cv_magic == CV_MAGIC);
	atomic_inc(&cvp->cv_refs);

	/*
	 * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
	 * waiter will be set runnable with each call to wake_up().
	 * Additionally wake_up() holds a spin_lock associated with
	 * the wait queue to ensure we don't race waking up processes.
	 */
	if (atomic_read(&cvp->cv_waiters) > 0)
		wake_up(&cvp->cv_event);

	atomic_dec(&cvp->cv_refs);
}
EXPORT_SYMBOL(__cv_signal);

void
__cv_broadcast(kcondvar_t *cvp)
{
	ASSERT(cvp);
	ASSERT(cvp->cv_magic == CV_MAGIC);
	atomic_inc(&cvp->cv_refs);

	/*
	 * Wake_up_all() will wake up all waiters even those which
	 * have the WQ_FLAG_EXCLUSIVE flag set.
	 */
	if (atomic_read(&cvp->cv_waiters) > 0)
		wake_up_all(&cvp->cv_event);

	atomic_dec(&cvp->cv_refs);
}
EXPORT_SYMBOL(__cv_broadcast);