mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-15 04:30:33 +03:00
d48091de81
This commit adds a new test case to the ZFS Test Suite to verify ZED can detect when a device is physically removed from a running system: the device will be offlined if a spare is not available in the pool. We implement this by using the existing libudev functionality and without relying solely on the FM kernel module capabilities which have been observed to be unreliable with some kernels. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Don Brady <don.brady@delphix.com> Signed-off-by: loli10K <ezomori.nozomu@gmail.com> Closes #1537 Closes #7926
422 lines
11 KiB
C
422 lines
11 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License Version 1.0 (CDDL-1.0).
|
|
* You can obtain a copy of the license from the top-level file
|
|
* "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
|
|
* You may not use this file except in compliance with the license.
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2016, Intel Corporation.
|
|
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
|
|
*/
|
|
|
|
#include <libnvpair.h>
|
|
#include <libzfs.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/list.h>
|
|
#include <sys/time.h>
|
|
#include <sys/sysevent/eventdefs.h>
|
|
#include <sys/sysevent/dev.h>
|
|
#include <sys/fm/protocol.h>
|
|
#include <sys/fm/fs/zfs.h>
|
|
#include <pthread.h>
|
|
#include <unistd.h>
|
|
|
|
#include "zfs_agents.h"
|
|
#include "fmd_api.h"
|
|
#include "../zed_log.h"
|
|
|
|
/*
|
|
* agent dispatch code
|
|
*/
|
|
|
|
static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
|
|
static list_t agent_events; /* list of pending events */
|
|
static int agent_exiting;
|
|
|
|
typedef struct agent_event {
|
|
char ae_class[64];
|
|
char ae_subclass[32];
|
|
nvlist_t *ae_nvl;
|
|
list_node_t ae_node;
|
|
} agent_event_t;
|
|
|
|
pthread_t g_agents_tid;
|
|
|
|
libzfs_handle_t *g_zfs_hdl;
|
|
|
|
/*
 * guid search data
 *
 * Kind of vdev a devid search matched.  DEVICE_TYPE_L2ARC has the value 0,
 * so a zero-initialized guid_search_t reads as "l2arc" until a search
 * stores a type in it.
 */
typedef enum device_type {
	DEVICE_TYPE_L2ARC = 0,		/* l2arc device */
	DEVICE_TYPE_SPARE = 1,		/* spare device */
	DEVICE_TYPE_PRIMARY = 2		/* any primary pool storage device */
} device_type_t;
|
|
|
|
typedef struct guid_search {
|
|
uint64_t gs_pool_guid;
|
|
uint64_t gs_vdev_guid;
|
|
char *gs_devid;
|
|
device_type_t gs_vdev_type;
|
|
uint64_t gs_vdev_expandtime; /* vdev expansion time */
|
|
} guid_search_t;
|
|
|
|
/*
|
|
* Walks the vdev tree recursively looking for a matching devid.
|
|
* Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
|
|
*/
|
|
static boolean_t
|
|
zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
|
|
{
|
|
guid_search_t *gsp = arg;
|
|
char *path = NULL;
|
|
uint_t c, children;
|
|
nvlist_t **child;
|
|
|
|
/*
|
|
* First iterate over any children.
|
|
*/
|
|
if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
|
|
&child, &children) == 0) {
|
|
for (c = 0; c < children; c++) {
|
|
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
|
|
gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
|
|
return (B_TRUE);
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
* Iterate over any spares and cache devices
|
|
*/
|
|
if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
|
|
&child, &children) == 0) {
|
|
for (c = 0; c < children; c++) {
|
|
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
|
|
gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
|
|
return (B_TRUE);
|
|
}
|
|
}
|
|
}
|
|
if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
|
|
&child, &children) == 0) {
|
|
for (c = 0; c < children; c++) {
|
|
if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
|
|
gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
|
|
return (B_TRUE);
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
* On a devid match, grab the vdev guid and expansion time, if any.
|
|
*/
|
|
if ((nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
|
|
(strcmp(gsp->gs_devid, path) == 0)) {
|
|
(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
|
|
&gsp->gs_vdev_guid);
|
|
(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
|
|
&gsp->gs_vdev_expandtime);
|
|
return (B_TRUE);
|
|
}
|
|
|
|
return (B_FALSE);
|
|
}
|
|
|
|
static int
|
|
zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
|
|
{
|
|
guid_search_t *gsp = arg;
|
|
nvlist_t *config, *nvl;
|
|
|
|
/*
|
|
* For each vdev in this pool, look for a match by devid
|
|
*/
|
|
if ((config = zpool_get_config(zhp, NULL)) != NULL) {
|
|
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
|
|
&nvl) == 0) {
|
|
(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
|
|
}
|
|
}
|
|
/*
|
|
* if a match was found then grab the pool guid
|
|
*/
|
|
if (gsp->gs_vdev_guid) {
|
|
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
|
|
&gsp->gs_pool_guid);
|
|
}
|
|
|
|
zpool_close(zhp);
|
|
return (gsp->gs_vdev_guid != 0);
|
|
}
|
|
|
|
void
|
|
zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
|
|
{
|
|
agent_event_t *event;
|
|
|
|
if (subclass == NULL)
|
|
subclass = "";
|
|
|
|
event = malloc(sizeof (agent_event_t));
|
|
if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
|
|
if (event)
|
|
free(event);
|
|
return;
|
|
}
|
|
|
|
if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
|
|
class = EC_ZFS;
|
|
subclass = ESC_ZFS_VDEV_CHECK;
|
|
}
|
|
|
|
/*
|
|
* On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED
|
|
* ereport from vdev_disk layer after a hot unplug. Fortunately we
|
|
* get a EC_DEV_REMOVE from our disk monitor and it is a suitable
|
|
* proxy so we remap it here for the benefit of the diagnosis engine.
|
|
*/
|
|
if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
|
|
(strcmp(subclass, ESC_DISK) == 0) &&
|
|
(nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
|
|
nvlist_exists(nvl, DEV_IDENTIFIER))) {
|
|
nvlist_t *payload = event->ae_nvl;
|
|
struct timeval tv;
|
|
int64_t tod[2];
|
|
uint64_t pool_guid = 0, vdev_guid = 0;
|
|
guid_search_t search = { 0 };
|
|
device_type_t devtype = DEVICE_TYPE_PRIMARY;
|
|
|
|
class = "resource.fs.zfs.removed";
|
|
subclass = "";
|
|
|
|
(void) nvlist_add_string(payload, FM_CLASS, class);
|
|
(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
|
|
(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
|
|
|
|
(void) gettimeofday(&tv, NULL);
|
|
tod[0] = tv.tv_sec;
|
|
tod[1] = tv.tv_usec;
|
|
(void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);
|
|
|
|
/*
|
|
* For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
|
|
* ZFS_EV_POOL_GUID may be missing so find them.
|
|
*/
|
|
(void) nvlist_lookup_string(nvl, DEV_IDENTIFIER,
|
|
&search.gs_devid);
|
|
(void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
|
|
pool_guid = search.gs_pool_guid;
|
|
vdev_guid = search.gs_vdev_guid;
|
|
devtype = search.gs_vdev_type;
|
|
|
|
/*
|
|
* We want to avoid reporting "remove" events coming from
|
|
* libudev for VDEVs which were expanded recently (10s) and
|
|
* avoid activating spares in response to partitions being
|
|
* deleted and created in rapid succession.
|
|
*/
|
|
if (search.gs_vdev_expandtime != 0 &&
|
|
search.gs_vdev_expandtime + 10 > tv.tv_sec) {
|
|
zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
|
|
"for recently expanded device '%s'", EC_DEV_REMOVE,
|
|
search.gs_devid);
|
|
goto out;
|
|
}
|
|
|
|
(void) nvlist_add_uint64(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
|
|
(void) nvlist_add_uint64(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
|
|
switch (devtype) {
|
|
case DEVICE_TYPE_L2ARC:
|
|
(void) nvlist_add_string(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
|
|
VDEV_TYPE_L2CACHE);
|
|
break;
|
|
case DEVICE_TYPE_SPARE:
|
|
(void) nvlist_add_string(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
|
|
break;
|
|
case DEVICE_TYPE_PRIMARY:
|
|
(void) nvlist_add_string(payload,
|
|
FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
|
|
break;
|
|
}
|
|
|
|
zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
|
|
EC_DEV_REMOVE, class);
|
|
}
|
|
|
|
(void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
|
|
(void) strlcpy(event->ae_subclass, subclass,
|
|
sizeof (event->ae_subclass));
|
|
|
|
(void) pthread_mutex_lock(&agent_lock);
|
|
list_insert_tail(&agent_events, event);
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
|
|
out:
|
|
(void) pthread_cond_signal(&agent_cond);
|
|
}
|
|
|
|
static void
|
|
zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
|
|
{
|
|
/*
|
|
* The diagnosis engine subscribes to the following events.
|
|
* On illumos these subscriptions reside in:
|
|
* /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
|
|
*/
|
|
if (strstr(class, "ereport.fs.zfs.") != NULL ||
|
|
strstr(class, "resource.fs.zfs.") != NULL ||
|
|
strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
|
|
strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
|
|
strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
|
|
fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
|
|
}
|
|
|
|
/*
|
|
* The retire agent subscribes to the following events.
|
|
* On illumos these subscriptions reside in:
|
|
* /usr/lib/fm/fmd/plugins/zfs-retire.conf
|
|
*
|
|
* NOTE: faults events come directly from our diagnosis engine
|
|
* and will not pass through the zfs kernel module.
|
|
*/
|
|
if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
|
|
strcmp(class, "resource.fs.zfs.removed") == 0 ||
|
|
strcmp(class, "resource.fs.zfs.statechange") == 0 ||
|
|
strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
|
|
fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
|
|
}
|
|
|
|
/*
|
|
* The SLM module only consumes disk events and vdev check events
|
|
*
|
|
* NOTE: disk events come directly from disk monitor and will
|
|
* not pass through the zfs kernel module.
|
|
*/
|
|
if (strstr(class, "EC_dev_") != NULL ||
|
|
strcmp(class, EC_ZFS) == 0) {
|
|
(void) zfs_slm_event(class, subclass, nvl);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Events are consumed and dispatched from this thread
|
|
* An agent can also post an event so event list lock
|
|
* is not held when calling an agent.
|
|
* One event is consumed at a time.
|
|
*/
|
|
static void *
|
|
zfs_agent_consumer_thread(void *arg)
|
|
{
|
|
for (;;) {
|
|
agent_event_t *event;
|
|
|
|
(void) pthread_mutex_lock(&agent_lock);
|
|
|
|
/* wait for an event to show up */
|
|
while (!agent_exiting && list_is_empty(&agent_events))
|
|
(void) pthread_cond_wait(&agent_cond, &agent_lock);
|
|
|
|
if (agent_exiting) {
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
|
|
"exiting");
|
|
return (NULL);
|
|
}
|
|
|
|
if ((event = (list_head(&agent_events))) != NULL) {
|
|
list_remove(&agent_events, event);
|
|
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
|
|
/* dispatch to all event subscribers */
|
|
zfs_agent_dispatch(event->ae_class, event->ae_subclass,
|
|
event->ae_nvl);
|
|
|
|
nvlist_free(event->ae_nvl);
|
|
free(event);
|
|
continue;
|
|
}
|
|
|
|
(void) pthread_mutex_unlock(&agent_lock);
|
|
}
|
|
|
|
return (NULL);
|
|
}
|
|
|
|
void
|
|
zfs_agent_init(libzfs_handle_t *zfs_hdl)
|
|
{
|
|
fmd_hdl_t *hdl;
|
|
|
|
g_zfs_hdl = zfs_hdl;
|
|
|
|
if (zfs_slm_init() != 0)
|
|
zed_log_die("Failed to initialize zfs slm");
|
|
zed_log_msg(LOG_INFO, "Add Agent: init");
|
|
|
|
hdl = fmd_module_hdl("zfs-diagnosis");
|
|
_zfs_diagnosis_init(hdl);
|
|
if (!fmd_module_initialized(hdl))
|
|
zed_log_die("Failed to initialize zfs diagnosis");
|
|
|
|
hdl = fmd_module_hdl("zfs-retire");
|
|
_zfs_retire_init(hdl);
|
|
if (!fmd_module_initialized(hdl))
|
|
zed_log_die("Failed to initialize zfs retire");
|
|
|
|
list_create(&agent_events, sizeof (agent_event_t),
|
|
offsetof(struct agent_event, ae_node));
|
|
|
|
if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
|
|
NULL) != 0) {
|
|
list_destroy(&agent_events);
|
|
zed_log_die("Failed to initialize agents");
|
|
}
|
|
}
|
|
|
|
void
|
|
zfs_agent_fini(void)
|
|
{
|
|
fmd_hdl_t *hdl;
|
|
agent_event_t *event;
|
|
|
|
agent_exiting = 1;
|
|
(void) pthread_cond_signal(&agent_cond);
|
|
|
|
/* wait for zfs_enum_pools thread to complete */
|
|
(void) pthread_join(g_agents_tid, NULL);
|
|
|
|
/* drain any pending events */
|
|
while ((event = (list_head(&agent_events))) != NULL) {
|
|
list_remove(&agent_events, event);
|
|
nvlist_free(event->ae_nvl);
|
|
free(event);
|
|
}
|
|
|
|
list_destroy(&agent_events);
|
|
|
|
if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
|
|
_zfs_retire_fini(hdl);
|
|
fmd_hdl_unregister(hdl);
|
|
}
|
|
if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
|
|
_zfs_diagnosis_fini(hdl);
|
|
fmd_hdl_unregister(hdl);
|
|
}
|
|
|
|
zed_log_msg(LOG_INFO, "Add Agent: fini");
|
|
zfs_slm_fini();
|
|
|
|
g_zfs_hdl = NULL;
|
|
}
|