From 0e4c830bc19766e860e760e10e0d59250f12cced Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 12:55:37 -0400 Subject: [PATCH] Cleanup: Use OpenSolaris functions to call scheduler In our codebase, `cond_resched() and `schedule()` are Linux kernel functions that have replaced the OpenSolaris `kpreempt()` functions in the codebase to such an extent that `kpreempt()` in zfs_context.h was broken. Nobody noticed because we did not actually use it. The header had defined `kpreempt()` as `yield()`, which works on OpenSolaris and Illumos where `sched_yield()` is a wrapper for `yield()`, but that does not work on any other platform. The FreeBSD platform specific code implemented shims for these, but the shim for `schedule()` forced us to wait, which is different than merely rescheduling to another thread as the original Linux code does, while the shim for `cond_resched()` had the same definition as its kernel kpreempt() shim. After studying this, I have concluded that we should reintroduce the kpreempt() function in platform independent code with the following definitions: - In the Linux kernel: kpreempt(unused) -> cond_resched() - In the FreeBSD kernel: kpreempt(unused) -> kern_yield(PRI_USER) - In userspace: kpreempt(unused) -> sched_yield() In userspace, nothing changes from this cleanup. In the kernels, the function `fm_fini()` will now call `kern_yield(PRI_USER)` on FreeBSD and `cond_resched()` on Linux. This is instead of `pause("schedule", 1)` on FreeBSD and `schedule()` on Linux. This makes our behavior consistent across platforms. Note that Linux's SPL continues to use `cond_resched()` and `schedule()`. However, those functions have been removed from both the FreeBSD code and userspace code. This should have the benefit of making it slightly easier to port the code to new platforms by making how things should be mapped less confusing. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13845 --- include/os/freebsd/spl/sys/disp.h | 2 ++ include/os/freebsd/spl/sys/timer.h | 2 -- include/os/freebsd/zfs/sys/zfs_context_os.h | 2 -- include/os/linux/spl/sys/disp.h | 4 +++- include/sys/zfs_context.h | 5 +++-- module/zfs/arc.c | 4 ++-- module/zfs/dnode.c | 6 +++--- module/zfs/fm.c | 2 +- module/zfs/spa_log_spacemap.c | 2 +- 9 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/os/freebsd/spl/sys/disp.h b/include/os/freebsd/spl/sys/disp.h index 2be1b76e4..d46a7d2c0 100644 --- a/include/os/freebsd/spl/sys/disp.h +++ b/include/os/freebsd/spl/sys/disp.h @@ -31,6 +31,8 @@ #include +#define KPREEMPT_SYNC (-1) + #define kpreempt(x) kern_yield(PRI_USER) #endif /* _OPENSOLARIS_SYS_DISP_H_ */ diff --git a/include/os/freebsd/spl/sys/timer.h b/include/os/freebsd/spl/sys/timer.h index d4694bb7c..7ff77e9b1 100644 --- a/include/os/freebsd/spl/sys/timer.h +++ b/include/os/freebsd/spl/sys/timer.h @@ -33,6 +33,4 @@ #define usleep_range(wakeup, wakeupepsilon) \ pause_sbt("usleep_range", ustosbt(wakeup), \ ustosbt(wakeupepsilon - wakeup), 0) - -#define schedule() pause("schedule", 1) #endif diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h index 867199501..1ce723304 100644 --- a/include/os/freebsd/zfs/sys/zfs_context_os.h +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -45,8 +45,6 @@ #define HAVE_LARGE_STACKS 1 #endif -#define cond_resched() kern_yield(PRI_USER) - #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ ((void) sizeof (dc), taskq_create(a, b, maxclsyspri, d, e, f)) diff --git a/include/os/linux/spl/sys/disp.h b/include/os/linux/spl/sys/disp.h index e106d3c54..c8be6ffbf 100644 --- a/include/os/linux/spl/sys/disp.h +++ b/include/os/linux/spl/sys/disp.h @@ -26,7 +26,9 @@ #include -#define kpreempt(unused) schedule() +#define KPREEMPT_SYNC (-1) + +#define kpreempt(unused) cond_resched() #define kpreempt_disable() preempt_disable() #define kpreempt_enable() preempt_enable() diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index aa4f78789..83ed97fbe 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -219,7 +219,6 @@ typedef pthread_t kthread_t; #define TS_JOINABLE 0x00000004 #define curthread ((void *)(uintptr_t)pthread_self()) -#define kpreempt(x) yield() #define getcomm() "unknown" #define thread_create_named(name, stk, stksize, func, arg, len, \ @@ -248,9 +247,11 @@ extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) +#define KPREEMPT_SYNC (-1) + +#define kpreempt(x) sched_yield() #define kpreempt_disable() ((void)0) #define kpreempt_enable() ((void)0) -#define cond_resched() sched_yield() /* * Mutexes diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 980dc60d0..b9969bff5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4165,7 +4165,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * this CPU are able to make progress, make a voluntary preemption * call here. */ - cond_resched(); + kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } @@ -10335,7 +10335,7 @@ l2arc_rebuild(l2arc_dev_t *dev) !dev->l2ad_first) goto out; - cond_resched(); + kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 67fe1e2c9..ef27dfd40 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1142,7 +1142,7 @@ dnode_free_interior_slots(dnode_t *dn) while (!dnode_slots_tryenter(children, idx, slots)) { DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } dnode_set_slots(children, idx, slots, DN_SLOT_FREE); @@ -1423,7 +1423,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } /* @@ -1478,7 +1478,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } if (!dnode_check_slots_free(dnc, idx, slots)) { diff --git a/module/zfs/fm.c b/module/zfs/fm.c index e7a7ad583..bc13b5517 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -1354,7 +1354,7 @@ fm_fini(void) zevent_flags |= ZEVENT_SHUTDOWN; while (zevent_waiters > 0) { mutex_exit(&zevent_lock); - schedule(); + kpreempt(KPREEMPT_SYNC); mutex_enter(&zevent_lock); } mutex_exit(&zevent_lock); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 19e334916..4ecce8214 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1176,7 +1176,7 @@ spa_ld_log_sm_data(spa_t *spa) } /* Load TXG log spacemap into ms_unflushed_allocs/frees. */ - cond_resched(); + kpreempt(KPREEMPT_SYNC); ASSERT0(sls->sls_nblocks); sls->sls_nblocks = space_map_nblocks(sls->sls_sm); spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;