mirror_zfs/cmd/zed/zed_disk_event.c
Don Brady 976246fadd Add illumos FMD ZFS logic to ZED -- phase 2
The phase 2 work primarily entails the Diagnosis Engine and
the Retire Agent modules. It also includes infrastructure
to support a crude FMD environment to host these modules.

The Diagnosis Engine consumes I/O and checksum ereports and
feeds them into a SERD engine which will generate a
corresponding fault diagnosis when the SERD engine fires. All the
diagnosis state data is collected into cases, one case per
vdev being tracked.

The Retire Agent responds to diagnosed faults by isolating
the faulty VDEV. It will notify the ZFS kernel module of
the new VDEV state (degraded or faulted). This agent is
also responsible for managing hot spares across pools.
When it encounters a device fault or a device removal it
replaces the device with an appropriate spare if available.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@intel.com>
Closes #5343
2016-11-07 15:01:38 -08:00

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2016, Intel Corporation.
 */

#ifdef HAVE_LIBUDEV
#include <errno.h>
#include <fcntl.h>
#include <libnvpair.h>
#include <libudev.h>
#include <libzfs.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include "zed_log.h"
#include "zed_disk_event.h"
#include "agents/zfs_agents.h"

/*
 * Portions of ZED need to see disk events for disks belonging to ZFS pools.
 * A libudev monitor is established to monitor block device actions and pass
 * them on to internal ZED logic modules. Initially, zfs_mod.c is the only
 * consumer and is the Linux equivalent of the illumos syseventd ZFS SLM
 * module responsible for handling disk events for ZFS.
 */

pthread_t g_mon_tid;
struct udev *g_udev;
struct udev_monitor *g_mon;
#define DEV_BYID_PATH "/dev/disk/by-id/"
/* 64 MiB (131072 x 512-byte sectors) is the minimum usable disk size for ZFS */
#define MINIMUM_SECTORS 131072

/*
 * Post disk event to SLM module
 *
 * Occurs in the context of the monitor thread.
 */
static void
zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	char *strval;
	uint64_t numval;

	zed_log_msg(LOG_INFO, "zed_disk_event:");
	zed_log_msg(LOG_INFO, "\tclass: %s", class);
	zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
	if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
	if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
	if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
		zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);

	(void) zfs_agent_post_event(class, subclass, nvl);
}

/*
 * dev_event_nvlist: place event schema into an nv pair list
 *
 * NAME			VALUE (example)
 * --------------	--------------------------------------------------------
 * DEV_NAME		/dev/sdl
 * DEV_PATH		/devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
 * DEV_IDENTIFIER	ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
 * DEV_PHYS_PATH	pci-0000:04:00.0-sas-0x4433221101000000-lun-0
 * DEV_IS_PART		---
 * DEV_SIZE		500107862016
 * ZFS_EV_POOL_GUID	17523635698032189180
 * ZFS_EV_VDEV_GUID	14663607734290803088
 */
static nvlist_t *
dev_event_nvlist(struct udev_device *dev)
{
	nvlist_t *nvl;
	char strval[128];
	const char *value, *path;
	uint64_t guid;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (NULL);

	if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
		(void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
	if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
		(void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
	if ((path = udev_device_get_devnode(dev)) != NULL)
		(void) nvlist_add_string(nvl, DEV_NAME, path);
	if ((value = udev_device_get_devpath(dev)) != NULL)
		(void) nvlist_add_string(nvl, DEV_PATH, value);
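	/*
	 * Flag partitions so consumers can tell a whole-disk event apart
	 * from a partition event (either DEVTYPE or a partition-entry
	 * property is sufficient evidence).
	 */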
	value = udev_device_get_devtype(dev);
	if ((value != NULL && strcmp("partition", value) == 0) ||
	    (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
	    != NULL)) {
		(void) nvlist_add_boolean(nvl, DEV_IS_PART);
	}
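	/*
	 * The sysfs "size" attribute reports the device size in 512-byte
	 * sectors, so scale by DEV_BSIZE to record the size in bytes.
	 */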
	if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
		uint64_t numval = DEV_BSIZE;

		numval *= strtoull(value, NULL, 10);
		(void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
	}

	/*
	 * Grab the pool and vdev guids from blkid cache
	 */
	value = udev_device_get_property_value(dev, "ID_FS_UUID");
	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
		(void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);

	value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
		(void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);

	/*
	 * Either a vdev guid or a devid must be present for matching
	 */
	if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
	    !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
		nvlist_free(nvl);
		return (NULL);
	}

	return (nvl);
}

/*
 * Listen for block device uevents
 */
static void *
zed_udev_monitor(void *arg)
{
	struct udev_monitor *mon = arg;
	char *tmp, *tmp2;

	zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");

	while (1) {
		struct udev_device *dev;
		const char *action, *type, *part, *sectors;
		const char *bus, *uuid;
		const char *class, *subclass;
		nvlist_t *nvl;
		boolean_t is_zfs = B_FALSE;

		/* allow a cancellation while blocked (recvmsg) */
		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);

		/* blocks at recvmsg until an event occurs */
		if ((dev = udev_monitor_receive_device(mon)) == NULL) {
			zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
			    "device error %d", errno);
			continue;
		}

		/* allow all steps to complete before a cancellation */
		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);

		/*
		 * Strongly typed device is the preferred filter
		 */
		type = udev_device_get_property_value(dev, "ID_FS_TYPE");
		if (type != NULL && type[0] != '\0') {
			if (strcmp(type, "zfs_member") == 0) {
				is_zfs = B_TRUE;
			} else {
				/* not ours, so skip */
				zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
				    "%s (in use by %s)",
				    udev_device_get_devnode(dev), type);
				udev_device_unref(dev);
				continue;
			}
		}

		/*
		 * if this is a disk and it is partitioned, then the
		 * zfs label will reside in a DEVTYPE=partition and
		 * we can skip passing this event
		 */
		type = udev_device_get_property_value(dev, "DEVTYPE");
		part = udev_device_get_property_value(dev,
		    "ID_PART_TABLE_TYPE");
		if (type != NULL && type[0] != '\0' &&
		    strcmp(type, "disk") == 0 &&
		    part != NULL && part[0] != '\0') {
			/* skip and wait for partition event */
			udev_device_unref(dev);
			continue;
		}

		/*
		 * ignore small partitions
		 */
		sectors = udev_device_get_property_value(dev,
		    "ID_PART_ENTRY_SIZE");
		if (sectors == NULL)
			sectors = udev_device_get_sysattr_value(dev, "size");
		if (sectors != NULL &&
		    strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
			udev_device_unref(dev);
			continue;
		}

		/*
		 * If the blkid probe didn't find ZFS, then a persistent
		 * device id string is required in the message schema
		 * for matching with vdevs. Preflight here for expected
		 * udev information.
		 */
		bus = udev_device_get_property_value(dev, "ID_BUS");
		uuid = udev_device_get_property_value(dev, "DM_UUID");
		if (!is_zfs && (bus == NULL && uuid == NULL)) {
			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
			    "source", udev_device_get_devnode(dev));
			udev_device_unref(dev);
			continue;
		}
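		/*
		 * Map the udev action to the sysevent class/subclass pair
		 * expected by the ZED agents.
		 */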
		action = udev_device_get_action(dev);
		if (strcmp(action, "add") == 0) {
			class = EC_DEV_ADD;
			subclass = ESC_DISK;
		} else if (strcmp(action, "remove") == 0) {
			class = EC_DEV_REMOVE;
			subclass = ESC_DISK;
		} else if (strcmp(action, "change") == 0) {
			class = EC_DEV_STATUS;
			subclass = ESC_DEV_DLE;
		} else {
			zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
			    action);
			udev_device_unref(dev);
			continue;
		}

		/*
		 * Special case an EC_DEV_ADD for multipath devices
		 *
		 * When a multipath device is created, udev reports the
		 * following:
		 *
		 * 1. "add" event of the dm device for the multipath device
		 *    (like /dev/dm-3).
		 * 2. "change" event to create the actual multipath device
		 *    symlink (like /dev/mapper/mpatha). The event also
		 *    passes back the relevant DM vars we care about, like
		 *    DM_UUID.
		 * 3. Another "change" event identical to #2 (that we ignore).
		 *
		 * To get the behavior we want, we treat the "change" event
		 * in #2 as an "add" event; as if "/dev/mapper/mpatha" was
		 * a new disk being added.
		 */
		if (strcmp(class, EC_DEV_STATUS) == 0 &&
		    udev_device_get_property_value(dev, "DM_UUID") &&
		    udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
			tmp = (char *) udev_device_get_devnode(dev);
			tmp2 = zfs_get_underlying_path(tmp);
			if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
				/*
				 * We have a real underlying device, which
				 * means that this multipath "change" event is
				 * an "add" event.
				 *
				 * If the multipath device and the underlying
				 * dev are the same name (i.e. /dev/dm-5), then
				 * there is no real underlying disk for this
				 * multipath device, and so this "change" event
				 * really is a multipath removal.
				 */
				class = EC_DEV_ADD;
				subclass = ESC_DISK;
			} else {
				tmp = (char *)
				    udev_device_get_property_value(dev,
				    "DM_NR_VALID_PATHS");
				/* treat as a multipath remove */
				if (tmp != NULL && strcmp(tmp, "0") == 0) {
					class = EC_DEV_REMOVE;
					subclass = ESC_DISK;
				}
			}
			free(tmp2);
		}
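		/*
		 * Bundle the device details into an nvlist and hand the
		 * event off to the ZED agents.
		 */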
		if ((nvl = dev_event_nvlist(dev)) != NULL) {
			zed_udev_event(class, subclass, nvl);
			nvlist_free(nvl);
		}

		udev_device_unref(dev);
	}

	return (NULL);
}

int
zed_disk_event_init()
{
	int fd, fflags;

	if ((g_udev = udev_new()) == NULL) {
		zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
		return (-1);
	}

	/* Set up a udev monitor for block devices */
	g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
	    "partition");
	udev_monitor_enable_receiving(g_mon);

	/* Make sure monitoring socket is blocking */
	fd = udev_monitor_get_fd(g_mon);
	if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
		(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);

	/* spawn a thread to monitor events */
	if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
		udev_monitor_unref(g_mon);
		udev_unref(g_udev);
		zed_log_msg(LOG_WARNING, "pthread_create failed");
		return (-1);
	}

	zed_log_msg(LOG_INFO, "zed_disk_event_init");

	return (0);
}

void
zed_disk_event_fini()
{
	/* cancel monitor thread at recvmsg() */
	(void) pthread_cancel(g_mon_tid);
	(void) pthread_join(g_mon_tid, NULL);

	/* cleanup udev resources */
	udev_monitor_unref(g_mon);
	udev_unref(g_udev);

	zed_log_msg(LOG_INFO, "zed_disk_event_fini");
}

#else
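
/* libudev is unavailable; provide no-op stubs so ZED still builds */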
#include "zed_disk_event.h"

int
zed_disk_event_init()
{
	return (0);
}

void
zed_disk_event_fini()
{
}

#endif /* HAVE_LIBUDEV */