mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 10:37:35 +03:00
deadlock between spa_errlog_lock and dp_config_rwlock
There is a lock order inversion deadlock between `spa_errlog_lock` and `dp_config_rwlock`: A thread in `spa_delete_dataset_errlog()` is running from a sync task. It is holding the `dp_config_rwlock` for writer (see `dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`. A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock` (see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as reader). Note that this was introduced by #12812. This commit address this by defining the lock ordering to be dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock. spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this order, and then process_error_block() and get_head_and_birth_txg() can verify that the dp_config_rwlock is already held. Additionally, a buffer overrun in `spa_get_errlog()` is corrected. Many code paths didn't check if `*count` got to zero, instead continuing to overwrite past the beginning of the userspace buffer at `uaddr`. Tested by having some errors in the pool (via `zinject -t data /path/to/file`), one thread running `zpool iostat 0.001`, and another thread runs `zfs destroy` (in a loop, although it hits the first time). This reproduces the problem easily without the fix, and works with the fix. Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Reviewed-by: George Amanakis <gamanakis@gmail.com> Reviewed-by: George Wilson <gwilson@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Closes #14239 Closes #14289
This commit is contained in:
+20
-26
@@ -4133,33 +4133,28 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
|
||||
{
|
||||
zfs_cmd_t zc = {"\0"};
|
||||
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
||||
uint64_t count;
|
||||
zbookmark_phys_t *zb = NULL;
|
||||
int i;
|
||||
zbookmark_phys_t *buf;
|
||||
uint64_t buflen = 10000; /* approx. 1MB of RAM */
|
||||
|
||||
if (fnvlist_lookup_uint64(zhp->zpool_config,
|
||||
ZPOOL_CONFIG_ERRCOUNT) == 0)
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* Retrieve the raw error list from the kernel. If the number of errors
|
||||
* has increased, allocate more space and continue until we get the
|
||||
* entire list.
|
||||
* Retrieve the raw error list from the kernel. If it doesn't fit,
|
||||
* allocate a larger buffer and retry.
|
||||
*/
|
||||
count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT);
|
||||
if (count == 0)
|
||||
return (0);
|
||||
zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
|
||||
count * sizeof (zbookmark_phys_t));
|
||||
zc.zc_nvlist_dst_size = count;
|
||||
(void) strcpy(zc.zc_name, zhp->zpool_name);
|
||||
for (;;) {
|
||||
buf = zfs_alloc(zhp->zpool_hdl,
|
||||
buflen * sizeof (zbookmark_phys_t));
|
||||
zc.zc_nvlist_dst = (uintptr_t)buf;
|
||||
zc.zc_nvlist_dst_size = buflen;
|
||||
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
|
||||
&zc) != 0) {
|
||||
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
||||
free(buf);
|
||||
if (errno == ENOMEM) {
|
||||
void *dst;
|
||||
|
||||
count = zc.zc_nvlist_dst_size;
|
||||
dst = zfs_alloc(zhp->zpool_hdl, count *
|
||||
sizeof (zbookmark_phys_t));
|
||||
zc.zc_nvlist_dst = (uintptr_t)dst;
|
||||
buflen *= 2;
|
||||
} else {
|
||||
return (zpool_standard_error_fmt(hdl, errno,
|
||||
dgettext(TEXT_DOMAIN, "errors: List of "
|
||||
@@ -4177,18 +4172,17 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
|
||||
* _not_ copied as part of the process. So we point the start of our
|
||||
* array appropriate and decrement the total number of elements.
|
||||
*/
|
||||
zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
|
||||
zc.zc_nvlist_dst_size;
|
||||
count -= zc.zc_nvlist_dst_size;
|
||||
zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size;
|
||||
uint64_t zblen = buflen - zc.zc_nvlist_dst_size;
|
||||
|
||||
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
|
||||
qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
|
||||
|
||||
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
|
||||
|
||||
/*
|
||||
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
|
||||
*/
|
||||
for (i = 0; i < count; i++) {
|
||||
for (uint64_t i = 0; i < zblen; i++) {
|
||||
nvlist_t *nv;
|
||||
|
||||
/* ignoring zb_blkid and zb_level for now */
|
||||
@@ -4215,11 +4209,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
|
||||
nvlist_free(nv);
|
||||
}
|
||||
|
||||
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
||||
free(buf);
|
||||
return (0);
|
||||
|
||||
nomem:
|
||||
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
||||
free(buf);
|
||||
return (no_memory(zhp->zpool_hdl));
|
||||
}
|
||||
|
||||
|
||||
@@ -222,7 +222,6 @@ check_status(nvlist_t *config, boolean_t isimport,
|
||||
{
|
||||
pool_scan_stat_t *ps = NULL;
|
||||
uint_t vsc, psc;
|
||||
uint64_t nerr;
|
||||
uint64_t suspended;
|
||||
uint64_t hostid = 0;
|
||||
uint64_t errata = 0;
|
||||
@@ -392,6 +391,7 @@ check_status(nvlist_t *config, boolean_t isimport,
|
||||
* Persistent data errors.
|
||||
*/
|
||||
if (!isimport) {
|
||||
uint64_t nerr;
|
||||
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
|
||||
&nerr) == 0 && nerr != 0)
|
||||
return (ZPOOL_STATUS_CORRUPT_DATA);
|
||||
|
||||
Reference in New Issue
Block a user