mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-04-30 14:08:24 +03:00
Fix data race between zil_commit() and zil_suspend()
openzfsonwindows/openzfs#206 found that it is possible to trip `VERIFY(list_is_empty(&lwb->lwb_itxs))` when a `zil_commit()` is delayed by the scheduler long enough for a parallel `zil_suspend()` operation to exit `zil_commit_impl()`. This is a data race. To prevent this, we introduce a `zilog->zl_suspend_lock` rwlock to ensure that all outstanding `zil_commit()` operations finish before `zil_suspend()` begins and that subsequent operations fallback to `txg_wait_synced()` after `zil_suspend()` has begun. On `PREEMPT_RT` Linux kernels, the `rw_enter()` implementation suffers from writer starvation. This means that a ZIL intensive system can delay `zil_suspend()` indefinitely. This is a pre-existing problem that affects everything that uses rw locks, so it needs to be addressed in the SPL. However, builds against `PREEMPT_RT` Linux kernels are currently broken due to a GPL symbol issue (#11097), so we can safely disregard that issue for now. Reported-by: Arun KV <arun.kv@datacore.com> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Richard Yao <richard.yao@alumni.stonybrook.edu> Closes #14514
This commit is contained in:
parent
9fd5fedc67
commit
4c856fb333
@ -183,6 +183,7 @@ struct zilog {
|
|||||||
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
|
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
|
||||||
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
|
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
|
||||||
uint64_t zl_replaying_seq; /* current replay seq number */
|
uint64_t zl_replaying_seq; /* current replay seq number */
|
||||||
|
krwlock_t zl_suspend_lock; /* protects suspend count */
|
||||||
uint32_t zl_suspend; /* log suspend count */
|
uint32_t zl_suspend; /* log suspend count */
|
||||||
kcondvar_t zl_cv_suspend; /* log suspend completion */
|
kcondvar_t zl_cv_suspend; /* log suspend completion */
|
||||||
uint8_t zl_suspending; /* log is currently suspending */
|
uint8_t zl_suspending; /* log is currently suspending */
|
||||||
|
@ -3221,6 +3221,21 @@ zil_commit(zilog_t *zilog, uint64_t foid)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The ->zl_suspend_lock rwlock ensures that all in-flight
|
||||||
|
* zil_commit() operations finish before suspension begins and that
|
||||||
|
* no more begin. Without it, it is possible for the scheduler to
|
||||||
|
* preempt us right after the zilog->zl_suspend suspend check, run
|
||||||
|
* another thread that runs zil_suspend() and after the other thread
|
||||||
|
* has finished its call to zil_commit_impl(), resume this thread while
|
||||||
|
* zil is suspended. This can trigger an assertion failure in
|
||||||
|
* VERIFY(list_is_empty(&lwb->lwb_itxs)). If it is held, it means that
|
||||||
|
* `zil_suspend()` is executing in another thread, so we go to
|
||||||
|
* txg_wait_synced().
|
||||||
|
*/
|
||||||
|
if (!rw_tryenter(&zilog->zl_suspend_lock, RW_READER))
|
||||||
|
goto wait;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the ZIL is suspended, we don't want to dirty it by calling
|
* If the ZIL is suspended, we don't want to dirty it by calling
|
||||||
* zil_commit_itx_assign() below, nor can we write out
|
* zil_commit_itx_assign() below, nor can we write out
|
||||||
@ -3229,11 +3244,14 @@ zil_commit(zilog_t *zilog, uint64_t foid)
|
|||||||
* semantics, and avoid calling those functions altogether.
|
* semantics, and avoid calling those functions altogether.
|
||||||
*/
|
*/
|
||||||
if (zilog->zl_suspend > 0) {
|
if (zilog->zl_suspend > 0) {
|
||||||
|
rw_exit(&zilog->zl_suspend_lock);
|
||||||
|
wait:
|
||||||
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
txg_wait_synced(zilog->zl_dmu_pool, 0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
zil_commit_impl(zilog, foid);
|
zil_commit_impl(zilog, foid);
|
||||||
|
rw_exit(&zilog->zl_suspend_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -3498,6 +3516,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
|
|||||||
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
|
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
|
||||||
cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
|
cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
|
||||||
|
|
||||||
|
rw_init(&zilog->zl_suspend_lock, NULL, RW_DEFAULT, NULL);
|
||||||
|
|
||||||
return (zilog);
|
return (zilog);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3537,6 +3557,8 @@ zil_free(zilog_t *zilog)
|
|||||||
cv_destroy(&zilog->zl_cv_suspend);
|
cv_destroy(&zilog->zl_cv_suspend);
|
||||||
cv_destroy(&zilog->zl_lwb_io_cv);
|
cv_destroy(&zilog->zl_lwb_io_cv);
|
||||||
|
|
||||||
|
rw_destroy(&zilog->zl_suspend_lock);
|
||||||
|
|
||||||
kmem_free(zilog, sizeof (zilog_t));
|
kmem_free(zilog, sizeof (zilog_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3664,11 +3686,14 @@ zil_suspend(const char *osname, void **cookiep)
|
|||||||
return (error);
|
return (error);
|
||||||
zilog = dmu_objset_zil(os);
|
zilog = dmu_objset_zil(os);
|
||||||
|
|
||||||
|
rw_enter(&zilog->zl_suspend_lock, RW_WRITER);
|
||||||
|
|
||||||
mutex_enter(&zilog->zl_lock);
|
mutex_enter(&zilog->zl_lock);
|
||||||
zh = zilog->zl_header;
|
zh = zilog->zl_header;
|
||||||
|
|
||||||
if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
|
if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
|
||||||
mutex_exit(&zilog->zl_lock);
|
mutex_exit(&zilog->zl_lock);
|
||||||
|
rw_exit(&zilog->zl_suspend_lock);
|
||||||
dmu_objset_rele(os, suspend_tag);
|
dmu_objset_rele(os, suspend_tag);
|
||||||
return (SET_ERROR(EBUSY));
|
return (SET_ERROR(EBUSY));
|
||||||
}
|
}
|
||||||
@ -3682,6 +3707,7 @@ zil_suspend(const char *osname, void **cookiep)
|
|||||||
if (cookiep == NULL && !zilog->zl_suspending &&
|
if (cookiep == NULL && !zilog->zl_suspending &&
|
||||||
(zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
|
(zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
|
||||||
mutex_exit(&zilog->zl_lock);
|
mutex_exit(&zilog->zl_lock);
|
||||||
|
rw_exit(&zilog->zl_suspend_lock);
|
||||||
dmu_objset_rele(os, suspend_tag);
|
dmu_objset_rele(os, suspend_tag);
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
@ -3690,6 +3716,7 @@ zil_suspend(const char *osname, void **cookiep)
|
|||||||
dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
|
dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
|
||||||
|
|
||||||
zilog->zl_suspend++;
|
zilog->zl_suspend++;
|
||||||
|
rw_exit(&zilog->zl_suspend_lock);
|
||||||
|
|
||||||
if (zilog->zl_suspend > 1) {
|
if (zilog->zl_suspend > 1) {
|
||||||
/*
|
/*
|
||||||
|
Loading…
Reference in New Issue
Block a user