mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-26 19:19:32 +03:00
55c12724d3
ZED does not take any action for disk removal events if there is no spare VDEV available. Added zpool_vdev_remove_wanted() in libzfs and vdev_remove_wanted() in vdev.c to remove the VDEV through ZED on removal event. This means that if you are running zed and remove a disk, it will be properly marked as REMOVED. Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Ameer Hamza <ahamza@ixsystems.com> Closes #13797
459 lines
12 KiB
C
459 lines
12 KiB
C
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * CDDL HEADER END
 */
|
|
|
|
/*
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
 * Copyright (c) 2021 Hewlett Packard Enterprise Development LP
 */
|
|
|
|
#include <libnvpair.h>
|
|
#include <libzfs.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/list.h>
|
|
#include <sys/time.h>
|
|
#include <sys/sysevent/eventdefs.h>
|
|
#include <sys/sysevent/dev.h>
|
|
#include <sys/fm/protocol.h>
|
|
#include <sys/fm/fs/zfs.h>
|
|
#include <pthread.h>
|
|
#include <unistd.h>
|
|
|
|
#include "zfs_agents.h"
|
|
#include "fmd_api.h"
|
|
#include "../zed_log.h"
|
|
|
|
/*
|
|
* agent dispatch code
|
|
*/
|
|
|
|
static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
|
|
static list_t agent_events; /* list of pending events */
|
|
static int agent_exiting;
|
|
|
|
typedef struct agent_event {
|
|
char ae_class[64];
|
|
char ae_subclass[32];
|
|
nvlist_t *ae_nvl;
|
|
list_node_t ae_node;
|
|
} agent_event_t;
|
|
|
|
pthread_t g_agents_tid;
|
|
|
|
libzfs_handle_t *g_zfs_hdl;
|
|
|
|
/*
 * guid search data
 *
 * Role of the vdev that matched a search.  Note that DEVICE_TYPE_L2ARC is
 * the zero value, so a zero-initialized guid_search_t starts out as L2ARC.
 */
typedef enum device_type {
	DEVICE_TYPE_L2ARC = 0,		/* l2arc device */
	DEVICE_TYPE_SPARE = 1,		/* spare device */
	DEVICE_TYPE_PRIMARY = 2		/* any primary pool storage device */
} device_type_t;
|
|
|
|
typedef struct guid_search {
|
|
uint64_t gs_pool_guid;
|
|
uint64_t gs_vdev_guid;
|
|
char *gs_devid;
|
|
device_type_t gs_vdev_type;
|
|
uint64_t gs_vdev_expandtime; /* vdev expansion time */
|
|
} guid_search_t;
|
|
|
|
/*
|
|
* Walks the vdev tree recursively looking for a matching devid.
|
|
* Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
|
|
*/
|
|
static boolean_t
|
|
zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
|
|
{
|
|
guid_search_t *gsp = arg;
|
|
char *path = NULL;
|
|
uint_t c, children;
|
|
nvlist_t **child;
|
|
uint64_t vdev_guid;
|
|
|
|
/*
|
|
* First iterate over any children.
|
|
*/
|
|
if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
|
|
&child, &children) == 0) {
|
|
for (c = 0; c < children; c++) {
|
|
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
|
|
gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
|
|
return (B_TRUE);
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
* Iterate over any spares and cache devices
|
|
*/
|
|
if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
|
|
&child, &children) == 0) {
|
|
for (c = 0; c < children; c++) {
|
|
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
|
|
gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
|
|
return (B_TRUE);
|
|
}
|
|
}
|
|
}
|
|
if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
|
|
&child, &children) == 0) {
|
|
for (c = 0; c < children; c++) {
|
|
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
|
|
gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
|
|
return (B_TRUE);
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
* On a devid match, grab the vdev guid and expansion time, if any.
|
|
*/
|
|
if (gsp->gs_devid != NULL &&
|
|
(nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
|
|
(strcmp(gsp->gs_devid, path) == 0)) {
|
|
(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
|
|
&gsp->gs_vdev_guid);
|
|
(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
|
|
&gsp->gs_vdev_expandtime);
|
|
return (B_TRUE);
|
|
}
|
|
/*
|
|
* Otherwise, on a vdev guid match, grab the devid and expansion
|
|
* time. The devid might be missing on removal since its not part
|
|
* of blkid cache and L2ARC VDEV does not contain pool guid in its
|
|
* blkid, so this is a special case for L2ARC VDEV.
|
|
*/
|
|
else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL &&
|
|
nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&
|
|
gsp->gs_vdev_guid == vdev_guid) {
|
|
(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID,
|
|
&gsp->gs_devid);
|
|
(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
|
|
&gsp->gs_vdev_expandtime);
|
|
return (B_TRUE);
|
|
}
|
|
|
|
return (B_FALSE);
|
|
}
|
|
|
|
static int
|
|
zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
|
|
{
|
|
guid_search_t *gsp = arg;
|
|
nvlist_t *config, *nvl;
|
|
|
|
/*
|
|
* For each vdev in this pool, look for a match by devid
|
|
*/
|
|
if ((config = zpool_get_config(zhp, NULL)) != NULL) {
|
|
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
|
|
&nvl) == 0) {
|
|
(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
|
|
}
|
|
}
|
|
/*
|
|
* if a match was found then grab the pool guid
|
|
*/
|
|
if (gsp->gs_vdev_guid && gsp->gs_devid) {
|
|
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
|
|
&gsp->gs_pool_guid);
|
|
}
|
|
|
|
zpool_close(zhp);
|
|
return (gsp->gs_vdev_guid != 0);
|
|
}
|
|
|
|
void
|
|
zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
|
|
{
|
|
agent_event_t *event;
|
|
|
|
if (subclass == NULL)
|
|
subclass = "";
|
|
|
|
event = malloc(sizeof (agent_event_t));
|
|
if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
|
|
if (event)
|
|
free(event);
|
|
return;
|
|
}
|
|
|
|
if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
|
|
class = EC_ZFS;
|
|
subclass = ESC_ZFS_VDEV_CHECK;
|
|
}
|
|
|
|
/*
|
|
* On Linux, we don't get the expected FM_RESOURCE_REMOVED ereport
|
|
* from the vdev_disk layer after a hot unplug. Fortunately we do
|
|
* get an EC_DEV_REMOVE from our disk monitor and it is a suitable
|
|
* proxy so we remap it here for the benefit of the diagnosis engine.
|
|
* Starting in OpenZFS 2.0, we do get FM_RESOURCE_REMOVED from the spa
|
|
* layer. Processing multiple FM_RESOURCE_REMOVED events is not harmful.
|
|
*/
|
|
if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
|
|
(strcmp(subclass, ESC_DISK) == 0) &&
|
|
(nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
|
|
nvlist_exists(nvl, DEV_IDENTIFIER))) {
|
|
nvlist_t *payload = event->ae_nvl;
|
|
struct timeval tv;
|
|
int64_t tod[2];
|
|
uint64_t pool_guid = 0, vdev_guid = 0;
|
|
guid_search_t search = { 0 };
|
|
device_type_t devtype = DEVICE_TYPE_PRIMARY;
|
|
char *devid = NULL;
|
|
|
|
class = "resource.fs.zfs.removed";
|
|
subclass = "";
|
|
|
|
(void) nvlist_add_string(payload, FM_CLASS, class);
|
|
(void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid);
|
|
(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
|
|
(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
|
|
|
|
(void) gettimeofday(&tv, NULL);
|
|
tod[0] = tv.tv_sec;
|
|
tod[1] = tv.tv_usec;
|
|
(void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);
|
|
|
|
/*
|
|
* If devid is missing but vdev_guid is available, find devid
|
|
* and pool_guid from vdev_guid.
|
|
* For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
|
|
* ZFS_EV_POOL_GUID may be missing so find them.
|
|
*/
|
|
if (devid == NULL || pool_guid == 0 || vdev_guid == 0) {
|
|
if (devid == NULL)
|
|
search.gs_vdev_guid = vdev_guid;
|
|
else
|
|
search.gs_devid = devid;
|
|
zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
|
|
if (devid == NULL)
|
|
devid = search.gs_devid;
|
|
if (pool_guid == 0)
|
|
pool_guid = search.gs_pool_guid;
|
|
if (vdev_guid == 0)
|
|
vdev_guid = search.gs_vdev_guid;
|
|
devtype = search.gs_vdev_type;
|
|
}
|
|
|
|
/*
|
|
* We want to avoid reporting "remove" events coming from
|
|
* libudev for VDEVs which were expanded recently (10s) and
|
|
* avoid activating spares in response to partitions being
|
|
* deleted and created in rapid succession.
|
|
*/
|
|
if (search.gs_vdev_expandtime != 0 &&
|
|
search.gs_vdev_expandtime + 10 > tv.tv_sec) {
|
|
zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
|
|
"for recently expanded device '%s'", EC_DEV_REMOVE,
|
|
devid);
|
|
fnvlist_free(payload);
|
|
free(event);
|
|
goto out;
|
|
}
|
|
|
|
(void) nvlist_add_uint64(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
|
|
(void) nvlist_add_uint64(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
|
|
switch (devtype) {
|
|
case DEVICE_TYPE_L2ARC:
|
|
(void) nvlist_add_string(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
|
|
VDEV_TYPE_L2CACHE);
|
|
break;
|
|
case DEVICE_TYPE_SPARE:
|
|
(void) nvlist_add_string(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
|
|
break;
|
|
case DEVICE_TYPE_PRIMARY:
|
|
(void) nvlist_add_string(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
|
|
break;
|
|
}
|
|
|
|
zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
|
|
EC_DEV_REMOVE, class);
|
|
}
|
|
|
|
(void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
|
|
(void) strlcpy(event->ae_subclass, subclass,
|
|
sizeof (event->ae_subclass));
|
|
|
|
(void) pthread_mutex_lock(&agent_lock);
|
|
list_insert_tail(&agent_events, event);
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
|
|
out:
|
|
(void) pthread_cond_signal(&agent_cond);
|
|
}
|
|
|
|
static void
|
|
zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
|
|
{
|
|
/*
|
|
* The diagnosis engine subscribes to the following events.
|
|
* On illumos these subscriptions reside in:
|
|
* /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
|
|
*/
|
|
if (strstr(class, "ereport.fs.zfs.") != NULL ||
|
|
strstr(class, "resource.fs.zfs.") != NULL ||
|
|
strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
|
|
strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
|
|
strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
|
|
fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
|
|
}
|
|
|
|
/*
|
|
* The retire agent subscribes to the following events.
|
|
* On illumos these subscriptions reside in:
|
|
* /usr/lib/fm/fmd/plugins/zfs-retire.conf
|
|
*
|
|
* NOTE: faults events come directly from our diagnosis engine
|
|
* and will not pass through the zfs kernel module.
|
|
*/
|
|
if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
|
|
strcmp(class, "resource.fs.zfs.removed") == 0 ||
|
|
strcmp(class, "resource.fs.zfs.statechange") == 0 ||
|
|
strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
|
|
fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
|
|
}
|
|
|
|
/*
|
|
* The SLM module only consumes disk events and vdev check events
|
|
*
|
|
* NOTE: disk events come directly from disk monitor and will
|
|
* not pass through the zfs kernel module.
|
|
*/
|
|
if (strstr(class, "EC_dev_") != NULL ||
|
|
strcmp(class, EC_ZFS) == 0) {
|
|
(void) zfs_slm_event(class, subclass, nvl);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Events are consumed and dispatched from this thread
|
|
* An agent can also post an event so event list lock
|
|
* is not held when calling an agent.
|
|
* One event is consumed at a time.
|
|
*/
|
|
static void *
|
|
zfs_agent_consumer_thread(void *arg)
|
|
{
|
|
(void) arg;
|
|
|
|
for (;;) {
|
|
agent_event_t *event;
|
|
|
|
(void) pthread_mutex_lock(&agent_lock);
|
|
|
|
/* wait for an event to show up */
|
|
while (!agent_exiting && list_is_empty(&agent_events))
|
|
(void) pthread_cond_wait(&agent_cond, &agent_lock);
|
|
|
|
if (agent_exiting) {
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
|
|
"exiting");
|
|
return (NULL);
|
|
}
|
|
|
|
if ((event = (list_head(&agent_events))) != NULL) {
|
|
list_remove(&agent_events, event);
|
|
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
|
|
/* dispatch to all event subscribers */
|
|
zfs_agent_dispatch(event->ae_class, event->ae_subclass,
|
|
event->ae_nvl);
|
|
|
|
nvlist_free(event->ae_nvl);
|
|
free(event);
|
|
continue;
|
|
}
|
|
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
}
|
|
|
|
return (NULL);
|
|
}
|
|
|
|
void
|
|
zfs_agent_init(libzfs_handle_t *zfs_hdl)
|
|
{
|
|
fmd_hdl_t *hdl;
|
|
|
|
g_zfs_hdl = zfs_hdl;
|
|
|
|
if (zfs_slm_init() != 0)
|
|
zed_log_die("Failed to initialize zfs slm");
|
|
zed_log_msg(LOG_INFO, "Add Agent: init");
|
|
|
|
hdl = fmd_module_hdl("zfs-diagnosis");
|
|
_zfs_diagnosis_init(hdl);
|
|
if (!fmd_module_initialized(hdl))
|
|
zed_log_die("Failed to initialize zfs diagnosis");
|
|
|
|
hdl = fmd_module_hdl("zfs-retire");
|
|
_zfs_retire_init(hdl);
|
|
if (!fmd_module_initialized(hdl))
|
|
zed_log_die("Failed to initialize zfs retire");
|
|
|
|
list_create(&agent_events, sizeof (agent_event_t),
|
|
offsetof(struct agent_event, ae_node));
|
|
|
|
if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
|
|
NULL) != 0) {
|
|
list_destroy(&agent_events);
|
|
zed_log_die("Failed to initialize agents");
|
|
}
|
|
pthread_setname_np(g_agents_tid, "agents");
|
|
}
|
|
|
|
void
|
|
zfs_agent_fini(void)
|
|
{
|
|
fmd_hdl_t *hdl;
|
|
agent_event_t *event;
|
|
|
|
agent_exiting = 1;
|
|
(void) pthread_cond_signal(&agent_cond);
|
|
|
|
/* wait for zfs_enum_pools thread to complete */
|
|
(void) pthread_join(g_agents_tid, NULL);
|
|
|
|
/* drain any pending events */
|
|
while ((event = (list_head(&agent_events))) != NULL) {
|
|
list_remove(&agent_events, event);
|
|
nvlist_free(event->ae_nvl);
|
|
free(event);
|
|
}
|
|
|
|
list_destroy(&agent_events);
|
|
|
|
if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
|
|
_zfs_retire_fini(hdl);
|
|
fmd_hdl_unregister(hdl);
|
|
}
|
|
if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
|
|
_zfs_diagnosis_fini(hdl);
|
|
fmd_hdl_unregister(hdl);
|
|
}
|
|
|
|
zed_log_msg(LOG_INFO, "Add Agent: fini");
|
|
zfs_slm_fini();
|
|
|
|
g_zfs_hdl = NULL;
|
|
}
|