mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 18:40:43 +03:00
zed: Ensure spare activation after kernel-initiated device removal
In addition to hotplug events, the kernel may also mark a failing vdev as REMOVED. This was observed in a customer report and reproduced by forcing the NVMe host driver to disable the device after a failed reset due to command timeout. In such cases, the spare was not activated because the device had already transitioned to a REMOVED state before zed processed the event. To address this, explicitly attempt hot spare activation when the kernel marks a device as REMOVED.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
Closes #17187
This commit is contained in:
+12
-7
@@ -8921,7 +8921,7 @@ spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart,
|
||||
*/
|
||||
|
||||
static void
|
||||
spa_async_remove(spa_t *spa, vdev_t *vd)
|
||||
spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
|
||||
{
|
||||
if (vd->vdev_remove_wanted) {
|
||||
vd->vdev_remove_wanted = B_FALSE;
|
||||
@@ -8941,11 +8941,11 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
|
||||
vdev_state_dirty(vd->vdev_top);
|
||||
|
||||
/* Tell userspace that the vdev is gone. */
|
||||
zfs_post_remove(spa, vd);
|
||||
zfs_post_remove(spa, vd, by_kernel);
|
||||
}
|
||||
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
spa_async_remove(spa, vd->vdev_child[c]);
|
||||
spa_async_remove(spa, vd->vdev_child[c], by_kernel);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -9039,13 +9039,18 @@ spa_async_thread(void *arg)
|
||||
/*
|
||||
* See if any devices need to be marked REMOVED.
|
||||
*/
|
||||
if (tasks & SPA_ASYNC_REMOVE) {
|
||||
if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) {
|
||||
boolean_t by_kernel = B_TRUE;
|
||||
if (tasks & SPA_ASYNC_REMOVE_BY_USER)
|
||||
by_kernel = B_FALSE;
|
||||
spa_vdev_state_enter(spa, SCL_NONE);
|
||||
spa_async_remove(spa, spa->spa_root_vdev);
|
||||
spa_async_remove(spa, spa->spa_root_vdev, by_kernel);
|
||||
for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
|
||||
spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
|
||||
spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i],
|
||||
by_kernel);
|
||||
for (int i = 0; i < spa->spa_spares.sav_count; i++)
|
||||
spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
|
||||
spa_async_remove(spa, spa->spa_spares.sav_vdevs[i],
|
||||
by_kernel);
|
||||
(void) spa_vdev_state_exit(spa, NULL, 0);
|
||||
}
|
||||
|
||||
|
||||
+1
-1
@@ -4271,7 +4271,7 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid)
|
||||
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
|
||||
|
||||
vd->vdev_remove_wanted = B_TRUE;
|
||||
spa_async_request(spa, SPA_ASYNC_REMOVE);
|
||||
spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER);
|
||||
|
||||
return (spa_vdev_state_exit(spa, vd, 0));
|
||||
}
|
||||
|
||||
+16
-2
@@ -1433,9 +1433,23 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
|
||||
* removal.
|
||||
*/
|
||||
void
|
||||
zfs_post_remove(spa_t *spa, vdev_t *vd)
|
||||
zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
|
||||
{
|
||||
zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
|
||||
nvlist_t *aux = NULL;
|
||||
|
||||
if (by_kernel) {
|
||||
/*
|
||||
* Add optional supplemental keys to payload
|
||||
*/
|
||||
aux = fm_nvlist_create(NULL);
|
||||
if (aux)
|
||||
fnvlist_add_boolean(aux, "by_kernel");
|
||||
}
|
||||
|
||||
zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, aux);
|
||||
|
||||
if (by_kernel && aux)
|
||||
fm_nvlist_destroy(aux, FM_NVA_FREE);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user