From 8056a75672a57c85b8e10c0c6bce138146f7d213 Mon Sep 17 00:00:00 2001 From: Matthew Macy Date: Thu, 18 Jun 2020 10:17:50 -0700 Subject: [PATCH] Disambiguate condvar API contract On Illumos callers of cv_timedwait and cv_timedwait_hires can't distinguish between whether or not the cv was signaled or the call timed out. Illumos handles this (for some definition of handles) by calling cv_signal in the return path if we were signaled but the return value indicates instead that we timed out. This would make sense if it were possible to query the the cv for its net signal disposition. However, this isn't possible and, in spite of the fact that there are places in the code that clearly take a different and incompatible path if a timeout value is indicated, this distinction appears to be rather subtle to most developers. This problem is further compounded by the fact that on Linux, calling cv_signal in the return path wouldn't even do the right thing unless there are other waiters. Since it is possible for the caller to independently determine how much time is remaining but it is not possible to query if the cv was in fact signaled, prioritizing signalling over timeout seems like a cleaner solution. In addition, judging from usage patterns within the code itself, it is also less error prone. Reviewed-by: Jorgen Lundman Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Matt Macy Closes #10471 --- include/os/freebsd/spl/sys/condvar.h | 36 ++++++++++++++++---- include/os/linux/spl/sys/condvar.h | 50 ++++++++++++++++++++++------ include/sys/zfs_context.h | 4 +-- lib/libzpool/kernel.c | 4 +-- module/os/linux/spl/spl-condvar.c | 33 ++++++++++-------- module/zfs/zil.c | 4 +-- 6 files changed, 94 insertions(+), 37 deletions(-) diff --git a/include/os/freebsd/spl/sys/condvar.h b/include/os/freebsd/spl/sys/condvar.h index ff5f308e2..4375bd6a3 100644 --- a/include/os/freebsd/spl/sys/condvar.h +++ b/include/os/freebsd/spl/sys/condvar.h @@ -37,6 +37,31 @@ #include #include +/* + * cv_timedwait() is similar to cv_wait() except that it additionally expects + * a timeout value specified in ticks. When woken by cv_signal() or + * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is + * returned. + * + * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks + * interruptibly and can be woken by a signal (EINTR, ERESTART). When + * this occurs 0 is returned. + * + * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait() + * and cv_timedwait_sig() which should be used when waiting for outstanding + * IO to complete. They are responsible for updating the iowait accounting + * when this is supported by the platform. + * + * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution + * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout + * to be specified as a hrtime_t allowing for timeouts of less than a tick. + * + * N.B. The return values differ slightly from the illumos implementation + * which returns the time remaining, instead of 1, when woken. They both + * return -1 on timeout. Consumers which need to know the time remaining + * are responsible for tracking it themselves. + */ + static __inline sbintime_t zfs_nstosbt(int64_t _ns) { @@ -120,7 +145,7 @@ cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) #define cv_timedwait_io cv_timedwait #define cv_timedwait_sig_io cv_timedwait_sig -static inline clock_t +static inline int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { @@ -135,7 +160,6 @@ cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, if (hrtime >= tim) return (-1); - rc = cv_timedwait_sbt(cvp, mp, zfs_nstosbt(tim), zfs_nstosbt(res), C_ABSOLUTE); @@ -143,11 +167,10 @@ cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, return (-1); KASSERT(rc == 0, ("unexpected rc value %d", rc)); - hrtime = tim - gethrtime(); - return ((hrtime > 0) ? hrtime : -1); + return (1); } -static inline clock_t +static inline int cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { @@ -175,8 +198,7 @@ cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, return (0); default: KASSERT(rc == 0, ("unexpected rc value %d", rc)); - hrtime = tim - gethrtime(); - return ((hrtime > 0) ? hrtime : -1); + return (1); } } diff --git a/include/os/linux/spl/sys/condvar.h b/include/os/linux/spl/sys/condvar.h index f1438c4e2..22408824f 100644 --- a/include/os/linux/spl/sys/condvar.h +++ b/include/os/linux/spl/sys/condvar.h @@ -32,6 +32,32 @@ #include #include +/* + * cv_timedwait() is similar to cv_wait() except that it additionally expects + * a timeout value specified in ticks. When woken by cv_signal() or + * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is + * returned. + * + * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks + * interruptibly and can be woken by a signal (EINTR, ERESTART). When + * this occurs 0 is returned. + * + * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait() + * and cv_timedwait_sig() which should be used when waiting for outstanding + * IO to complete. They are responsible for updating the iowait accounting + * when this is supported by the platform. + * + * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution + * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout + * to be specified as a hrtime_t allowing for timeouts of less than a tick. + * + * N.B. The return values differ slightly from the illumos implementation + * which returns the time remaining, instead of 1, when woken. They both + * return -1 on timeout. Consumers which need to know the time remaining + * are responsible for tracking it themselves. + */ + + /* * The kcondvar_t struct is protected by mutex taken externally before * calling any of the wait/signal funs, and passed into the wait funs. @@ -56,12 +82,12 @@ extern void __cv_wait(kcondvar_t *, kmutex_t *); extern void __cv_wait_io(kcondvar_t *, kmutex_t *); extern int __cv_wait_io_sig(kcondvar_t *, kmutex_t *); extern int __cv_wait_sig(kcondvar_t *, kmutex_t *); -extern clock_t __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t); -extern clock_t __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t); -extern clock_t __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); -extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, +extern int __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t); +extern int __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t); +extern int __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); +extern int cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t res, int flag); -extern clock_t cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t, +extern int cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t res, int flag); extern void __cv_signal(kcondvar_t *); extern void __cv_broadcast(kcondvar_t *c); @@ -72,12 +98,16 @@ extern void __cv_broadcast(kcondvar_t *c); #define cv_wait_io(cvp, mp) __cv_wait_io(cvp, mp) #define cv_wait_io_sig(cvp, mp) __cv_wait_io_sig(cvp, mp) #define cv_wait_sig(cvp, mp) __cv_wait_sig(cvp, mp) -#define cv_wait_interruptible(cvp, mp) cv_wait_sig(cvp, mp) -#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t) -#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t) -#define cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t) -#define cv_timedwait_interruptible(cvp, mp, t) cv_timedwait_sig(cvp, mp, t) #define cv_signal(cvp) __cv_signal(cvp) #define cv_broadcast(cvp) __cv_broadcast(cvp) +/* + * NB: There is no way to reliably distinguish between having been signalled + * and having timed out on Linux. If the client code needs to reliably + * distinguish between the two it should use the hires variant. + */ +#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t) +#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t) +#define cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t) + #endif /* _SPL_CONDVAR_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 72fcd74ca..7d5567db4 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -315,8 +315,8 @@ extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); -extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); -extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, +extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); +extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag); extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index d19ecc18f..f19ebac12 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -344,7 +344,7 @@ cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) return (1); } -clock_t +int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) { int error; @@ -378,7 +378,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) } /*ARGSUSED*/ -clock_t +int cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { diff --git a/module/os/linux/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c index 3cc33da62..9d045e3e8 100644 --- a/module/os/linux/spl/spl-condvar.c +++ b/module/os/linux/spl/spl-condvar.c @@ -301,10 +301,10 @@ __cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time, * with a thread holding the mutex and call cv_destroy. */ mutex_enter(mp); - return (time_left > 0 ? time_left : -1); + return (time_left > 0 ? 1 : -1); } -clock_t +int __cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) { return (__cv_timedwait_common(cvp, mp, exp_time, @@ -312,7 +312,7 @@ __cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) } EXPORT_SYMBOL(__cv_timedwait); -clock_t +int __cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) { return (__cv_timedwait_common(cvp, mp, exp_time, @@ -320,11 +320,13 @@ __cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) } EXPORT_SYMBOL(__cv_timedwait_io); -clock_t +int __cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) { - return (__cv_timedwait_common(cvp, mp, exp_time, - TASK_INTERRUPTIBLE, 0)); + int rc; + + rc = __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE, 0); + return (signal_pending(current) ? 0 : rc); } EXPORT_SYMBOL(__cv_timedwait_sig); @@ -341,6 +343,7 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, hrtime_t time_left; ktime_t ktime_left; u64 slack = 0; + int rc; ASSERT(cvp); ASSERT(mp); @@ -371,7 +374,7 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, ktime_left = ktime_set(0, time_left); slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC), MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC); - schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL); + rc = schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL); /* No more waiters a different mutex could be used */ if (atomic_dec_and_test(&cvp->cv_waiters)) { @@ -387,14 +390,13 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, atomic_dec(&cvp->cv_refs); mutex_enter(mp); - time_left = expire_time - gethrtime(); - return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1); + return (rc == -EINTR ? 1 : -1); } /* * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. */ -static clock_t +static int cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag, int state) { @@ -404,7 +406,7 @@ cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, return (__cv_timedwait_hires(cvp, mp, tim, res, state)); } -clock_t +int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { @@ -413,12 +415,15 @@ cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, } EXPORT_SYMBOL(cv_timedwait_hires); -clock_t +int cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { - return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, - TASK_INTERRUPTIBLE)); + int rc; + + rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE); + return (signal_pending(current) ? 0 : rc); } EXPORT_SYMBOL(cv_timedwait_sig_hires); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 34495af55..50b3b1434 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -2687,11 +2687,11 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * timeout is reached; responsibility (2) from * the comment above this function. */ - clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv, + int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); - if (timeleft != -1 || zcw->zcw_done) + if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE;