Use smaller default slack/delta value for schedule_hrtimeout_range()

For interrupt coalescing, cv_timedwait_hires() uses a 100us slack/delta
for calls to schedule_hrtimeout_range(). This 100us slack can be costly
for small writes.

This change improves small write performance by passing resolution `res`
parameter to schedule_hrtimeout_range() to be used as delta/slack. A new
tunable `spl_schedule_hrtimeout_slack_us` is added to preserve old
behavior when desired.

Performance observations on 8K recordsize filesystem:
- 8K random writes at 1-64 threads, up to 60% improvement for one thread
  and smaller gains as thread count increases. At >64 threads, 2-5%
  decrease in performance was observed.
- 8K sequential writes, similar 60% improvement for one thread and
  leveling out around 64 threads. At >64 threads, 5-10% decrease in
  performance was observed.
- 128K sequential write sees 1-5 for the 128K. No observed regression at
  high thread count.

Testing done on Ubuntu 18.04 with 4.15 kernel, 8vCPUs and SSD storage on
VMware ESX.

Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Signed-off-by: Tony Nguyen <tony.nguyen@delphix.com>
Closes #9217
This commit is contained in:
Tony Nguyen 2019-08-28 15:56:54 -06:00 committed by Tony Hutter
parent f785ce65c1
commit 16f42e1b6d
2 changed files with 38 additions and 18 deletions

View File

@ -26,8 +26,10 @@
#include <sys/condvar.h> #include <sys/condvar.h>
#include <sys/time.h> #include <sys/time.h>
#include <sys/sysmacros.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
#include <linux/compiler_compat.h> #include <linux/compiler_compat.h>
#include <linux/mod_compat.h>
#include <linux/sched.h> #include <linux/sched.h>
@ -35,6 +37,34 @@
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#endif #endif
#define MAX_HRTIMEOUT_SLACK_US 1000
unsigned int spl_schedule_hrtimeout_slack_us = 0;
static int
param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
{
unsigned long val;
int error;
error = kstrtoul(buf, 0, &val);
if (error)
return (error);
if (val > MAX_HRTIMEOUT_SLACK_US)
return (-EINVAL);
error = param_set_uint(buf, kp);
if (error < 0)
return (error);
return (0);
}
module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
"schedule_hrtimeout_range() delta/slack value in us, default(0)");
void void
__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
{ {
@ -304,12 +334,13 @@ EXPORT_SYMBOL(__cv_timedwait_sig);
*/ */
static clock_t static clock_t
__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
int state) hrtime_t res, int state)
{ {
DEFINE_WAIT(wait); DEFINE_WAIT(wait);
kmutex_t *m; kmutex_t *m;
hrtime_t time_left; hrtime_t time_left;
ktime_t ktime_left; ktime_t ktime_left;
u64 slack = 0;
ASSERT(cvp); ASSERT(cvp);
ASSERT(mp); ASSERT(mp);
@ -336,13 +367,11 @@ __cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
* race where 'cvp->cv_waiters > 0' but the list is empty. * race where 'cvp->cv_waiters > 0' but the list is empty.
*/ */
mutex_exit(mp); mutex_exit(mp);
/*
* Allow a 100 us range to give kernel an opportunity to coalesce
* interrupts
*/
ktime_left = ktime_set(0, time_left); ktime_left = ktime_set(0, time_left);
schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC, slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC),
HRTIMER_MODE_REL); MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC);
schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL);
/* No more waiters a different mutex could be used */ /* No more waiters a different mutex could be used */
if (atomic_dec_and_test(&cvp->cv_waiters)) { if (atomic_dec_and_test(&cvp->cv_waiters)) {
@ -369,19 +398,10 @@ static clock_t
cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
hrtime_t res, int flag, int state) hrtime_t res, int flag, int state)
{ {
if (res > 1) {
/*
* Align expiration to the specified resolution.
*/
if (flag & CALLOUT_FLAG_ROUNDUP)
tim += res - 1;
tim = (tim / res) * res;
}
if (!(flag & CALLOUT_FLAG_ABSOLUTE)) if (!(flag & CALLOUT_FLAG_ABSOLUTE))
tim += gethrtime(); tim += gethrtime();
return (__cv_timedwait_hires(cvp, mp, tim, state)); return (__cv_timedwait_hires(cvp, mp, tim, res, state));
} }
clock_t clock_t

View File

@ -672,7 +672,7 @@ mmp_thread(void *arg)
CALLB_CPR_SAFE_BEGIN(&cpr); CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv, (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
&mmp->mmp_thread_lock, next_time, USEC2NSEC(1), &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
CALLOUT_FLAG_ABSOLUTE); CALLOUT_FLAG_ABSOLUTE);
CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
} }