mirror of
				https://git.proxmox.com/git/mirror_zfs.git
				synced 2025-10-26 18:05:04 +03:00 
			
		
		
		
	This enables ZED to auto-online vdevs that are not wholedisk managed by ZFS. Signed-off-by: Ryan Moeller <ryan@iXsystems.com> Reviewed-by: Don Brady <don.brady@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tony Hutter <hutter2@llnl.gov>
		
			
				
	
	
		
			420 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			420 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * CDDL HEADER START
 | 
						|
 *
 | 
						|
 * The contents of this file are subject to the terms of the
 | 
						|
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 | 
						|
 * You can obtain a copy of the license from the top-level file
 | 
						|
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 | 
						|
 * You may not use this file except in compliance with the license.
 | 
						|
 *
 | 
						|
 * CDDL HEADER END
 | 
						|
 */
 | 
						|
 | 
						|
/*
 | 
						|
 * Copyright (c) 2016, 2017, Intel Corporation.
 | 
						|
 */
 | 
						|
 | 
						|
#ifdef HAVE_LIBUDEV
 | 
						|
 | 
						|
#include <errno.h>
 | 
						|
#include <fcntl.h>
 | 
						|
#include <libnvpair.h>
 | 
						|
#include <libudev.h>
 | 
						|
#include <libzfs.h>
 | 
						|
#include <libzutil.h>
 | 
						|
#include <pthread.h>
 | 
						|
#include <stdlib.h>
 | 
						|
#include <string.h>
 | 
						|
 | 
						|
#include <sys/sysevent/eventdefs.h>
 | 
						|
#include <sys/sysevent/dev.h>
 | 
						|
 | 
						|
#include "zed_log.h"
 | 
						|
#include "zed_disk_event.h"
 | 
						|
#include "agents/zfs_agents.h"
 | 
						|
 | 
						|
/*
 | 
						|
 * Portions of ZED need to see disk events for disks belonging to ZFS pools.
 | 
						|
 * A libudev monitor is established to monitor block device actions and pass
 | 
						|
 * them on to internal ZED logic modules.  Initially, zfs_mod.c is the only
 | 
						|
 * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
 | 
						|
 * module responsible for handling disk events for ZFS.
 | 
						|
 */
 | 
						|
 | 
						|
/*
 * Id of the udev monitor thread: filled in by pthread_create() in
 * zed_disk_event_init() and used by zed_disk_event_fini() to cancel and
 * join that thread.
 */
pthread_t g_mon_tid;
 | 
						|
/*
 * libudev context obtained from udev_new() in zed_disk_event_init() and
 * released with udev_unref() in zed_disk_event_fini().
 */
struct udev *g_udev;
 | 
						|
/*
 * udev monitor filtered to block "disk" and "partition" devices; it is
 * passed to the monitor thread as its argument and released in
 * zed_disk_event_fini().
 */
struct udev_monitor *g_mon;
 | 
						|
 | 
						|
 | 
						|
/*
 * Directory prefix for by-id device names.
 * NOTE(review): not referenced by any code visible in this file.
 */
#define	DEV_BYID_PATH	"/dev/disk/by-id/"
 | 
						|
 | 
						|
/* 64MB is minimum usable disk for ZFS */
 | 
						|
/*
 * Threshold, in sectors, used by zed_udev_monitor(): a device whose
 * ID_PART_ENTRY_SIZE property (or, failing that, sysfs "size" attribute)
 * is below this value is ignored.
 */
#define	MINIMUM_SECTORS		131072
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
 * Post disk event to SLM module
 | 
						|
 *
 | 
						|
 * occurs in the context of monitor thread
 | 
						|
 */
 | 
						|
static void
 | 
						|
zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
 | 
						|
{
 | 
						|
	char *strval;
 | 
						|
	uint64_t numval;
 | 
						|
 | 
						|
	zed_log_msg(LOG_INFO, "zed_disk_event:");
 | 
						|
	zed_log_msg(LOG_INFO, "\tclass: %s", class);
 | 
						|
	zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
 | 
						|
	if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
 | 
						|
	if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
 | 
						|
	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
 | 
						|
	if (nvlist_lookup_boolean(nvl, DEV_IS_PART) == B_TRUE)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: B_TRUE", DEV_IS_PART);
 | 
						|
	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
 | 
						|
	if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
 | 
						|
	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
 | 
						|
	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
 | 
						|
		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
 | 
						|
 | 
						|
	(void) zfs_agent_post_event(class, subclass, nvl);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * dev_event_nvlist: place event schema into an nv pair list
 | 
						|
 *
 | 
						|
 * NAME			VALUE (example)
 | 
						|
 * --------------	--------------------------------------------------------
 | 
						|
 * DEV_NAME		/dev/sdl
 | 
						|
 * DEV_PATH		/devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
 | 
						|
 * DEV_IDENTIFIER	ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
 | 
						|
 * DEV_PHYS_PATH	pci-0000:04:00.0-sas-0x4433221101000000-lun-0
 | 
						|
 * DEV_IS_PART		---
 | 
						|
 * DEV_SIZE		500107862016
 | 
						|
 * ZFS_EV_POOL_GUID	17523635698032189180
 | 
						|
 * ZFS_EV_VDEV_GUID	14663607734290803088
 | 
						|
 */
 | 
						|
static nvlist_t *
 | 
						|
dev_event_nvlist(struct udev_device *dev)
 | 
						|
{
 | 
						|
	nvlist_t *nvl;
 | 
						|
	char strval[128];
 | 
						|
	const char *value, *path;
 | 
						|
	uint64_t guid;
 | 
						|
 | 
						|
	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
 | 
						|
		return (NULL);
 | 
						|
 | 
						|
	if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
 | 
						|
		(void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
 | 
						|
	if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
 | 
						|
		(void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
 | 
						|
	if ((path = udev_device_get_devnode(dev)) != NULL)
 | 
						|
		(void) nvlist_add_string(nvl, DEV_NAME, path);
 | 
						|
	if ((value = udev_device_get_devpath(dev)) != NULL)
 | 
						|
		(void) nvlist_add_string(nvl, DEV_PATH, value);
 | 
						|
	value = udev_device_get_devtype(dev);
 | 
						|
	if ((value != NULL && strcmp("partition", value) == 0) ||
 | 
						|
	    (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
 | 
						|
	    != NULL)) {
 | 
						|
		(void) nvlist_add_boolean(nvl, DEV_IS_PART);
 | 
						|
	}
 | 
						|
	if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
 | 
						|
		uint64_t numval = DEV_BSIZE;
 | 
						|
 | 
						|
		numval *= strtoull(value, NULL, 10);
 | 
						|
		(void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
 | 
						|
	}
 | 
						|
 | 
						|
	/*
 | 
						|
	 * Grab the pool and vdev guids from blkid cache
 | 
						|
	 */
 | 
						|
	value = udev_device_get_property_value(dev, "ID_FS_UUID");
 | 
						|
	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
 | 
						|
		(void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
 | 
						|
 | 
						|
	value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
 | 
						|
	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
 | 
						|
		(void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
 | 
						|
 | 
						|
	/*
 | 
						|
	 * Either a vdev guid or a devid must be present for matching
 | 
						|
	 */
 | 
						|
	if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
 | 
						|
	    !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
 | 
						|
		nvlist_free(nvl);
 | 
						|
		return (NULL);
 | 
						|
	}
 | 
						|
 | 
						|
	return (nvl);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 *  Listen for block device uevents
 | 
						|
 */
 | 
						|
static void *
 | 
						|
zed_udev_monitor(void *arg)
 | 
						|
{
 | 
						|
	struct udev_monitor *mon = arg;
 | 
						|
	char *tmp, *tmp2;
 | 
						|
 | 
						|
	zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
 | 
						|
 | 
						|
	while (1) {
 | 
						|
		struct udev_device *dev;
 | 
						|
		const char *action, *type, *part, *sectors;
 | 
						|
		const char *bus, *uuid;
 | 
						|
		const char *class, *subclass;
 | 
						|
		nvlist_t *nvl;
 | 
						|
		boolean_t is_zfs = B_FALSE;
 | 
						|
 | 
						|
		/* allow a cancellation while blocked (recvmsg) */
 | 
						|
		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
 | 
						|
 | 
						|
		/* blocks at recvmsg until an event occurs */
 | 
						|
		if ((dev = udev_monitor_receive_device(mon)) == NULL) {
 | 
						|
			zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
 | 
						|
			    "device error %d", errno);
 | 
						|
			continue;
 | 
						|
		}
 | 
						|
 | 
						|
		/* allow all steps to complete before a cancellation */
 | 
						|
		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
 | 
						|
 | 
						|
		/*
 | 
						|
		 * Strongly typed device is the preferred filter
 | 
						|
		 */
 | 
						|
		type = udev_device_get_property_value(dev, "ID_FS_TYPE");
 | 
						|
		if (type != NULL && type[0] != '\0') {
 | 
						|
			if (strcmp(type, "zfs_member") == 0) {
 | 
						|
				is_zfs = B_TRUE;
 | 
						|
			} else {
 | 
						|
				/* not ours, so skip */
 | 
						|
				zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
 | 
						|
				    "%s (in use by %s)",
 | 
						|
				    udev_device_get_devnode(dev), type);
 | 
						|
				udev_device_unref(dev);
 | 
						|
				continue;
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		/*
 | 
						|
		 * if this is a disk and it is partitioned, then the
 | 
						|
		 * zfs label will reside in a DEVTYPE=partition and
 | 
						|
		 * we can skip passing this event
 | 
						|
		 */
 | 
						|
		type = udev_device_get_property_value(dev, "DEVTYPE");
 | 
						|
		part = udev_device_get_property_value(dev,
 | 
						|
		    "ID_PART_TABLE_TYPE");
 | 
						|
		if (type != NULL && type[0] != '\0' &&
 | 
						|
		    strcmp(type, "disk") == 0 &&
 | 
						|
		    part != NULL && part[0] != '\0') {
 | 
						|
			/* skip and wait for partition event */
 | 
						|
			udev_device_unref(dev);
 | 
						|
			continue;
 | 
						|
		}
 | 
						|
 | 
						|
		/*
 | 
						|
		 * ignore small partitions
 | 
						|
		 */
 | 
						|
		sectors = udev_device_get_property_value(dev,
 | 
						|
		    "ID_PART_ENTRY_SIZE");
 | 
						|
		if (sectors == NULL)
 | 
						|
			sectors = udev_device_get_sysattr_value(dev, "size");
 | 
						|
		if (sectors != NULL &&
 | 
						|
		    strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
 | 
						|
			udev_device_unref(dev);
 | 
						|
			continue;
 | 
						|
		}
 | 
						|
 | 
						|
		/*
 | 
						|
		 * If the blkid probe didn't find ZFS, then a persistent
 | 
						|
		 * device id string is required in the message schema
 | 
						|
		 * for matching with vdevs. Preflight here for expected
 | 
						|
		 * udev information.
 | 
						|
		 */
 | 
						|
		bus = udev_device_get_property_value(dev, "ID_BUS");
 | 
						|
		uuid = udev_device_get_property_value(dev, "DM_UUID");
 | 
						|
		if (!is_zfs && (bus == NULL && uuid == NULL)) {
 | 
						|
			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
 | 
						|
			    "source", udev_device_get_devnode(dev));
 | 
						|
			udev_device_unref(dev);
 | 
						|
			continue;
 | 
						|
		}
 | 
						|
 | 
						|
		action = udev_device_get_action(dev);
 | 
						|
		if (strcmp(action, "add") == 0) {
 | 
						|
			class = EC_DEV_ADD;
 | 
						|
			subclass = ESC_DISK;
 | 
						|
		} else if (strcmp(action, "remove") == 0) {
 | 
						|
			class = EC_DEV_REMOVE;
 | 
						|
			subclass = ESC_DISK;
 | 
						|
		} else if (strcmp(action, "change") == 0) {
 | 
						|
			class = EC_DEV_STATUS;
 | 
						|
			subclass = ESC_DEV_DLE;
 | 
						|
		} else {
 | 
						|
			zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
 | 
						|
			    action);
 | 
						|
			udev_device_unref(dev);
 | 
						|
			continue;
 | 
						|
		}
 | 
						|
 | 
						|
		/*
 | 
						|
		 * Special case an EC_DEV_ADD for multipath devices
 | 
						|
		 *
 | 
						|
		 * When a multipath device is created, udev reports the
 | 
						|
		 * following:
 | 
						|
		 *
 | 
						|
		 * 1.	"add" event of the dm device for the multipath device
 | 
						|
		 *	(like /dev/dm-3).
 | 
						|
		 * 2.	"change" event to create the actual multipath device
 | 
						|
		 *	symlink (like /dev/mapper/mpatha).  The event also
 | 
						|
		 *	passes back the relevant DM vars we care about, like
 | 
						|
		 *	DM_UUID.
 | 
						|
		 * 3.	Another "change" event identical to #2 (that we ignore).
 | 
						|
		 *
 | 
						|
		 * To get the behavior we want, we treat the "change" event
 | 
						|
		 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
 | 
						|
		 * a new disk being added.
 | 
						|
		 */
 | 
						|
		if (strcmp(class, EC_DEV_STATUS) == 0 &&
 | 
						|
		    udev_device_get_property_value(dev, "DM_UUID") &&
 | 
						|
		    udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
 | 
						|
			tmp = (char *)udev_device_get_devnode(dev);
 | 
						|
			tmp2 = zfs_get_underlying_path(tmp);
 | 
						|
			if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
 | 
						|
				/*
 | 
						|
				 * We have a real underlying device, which
 | 
						|
				 * means that this multipath "change" event is
 | 
						|
				 * an "add" event.
 | 
						|
				 *
 | 
						|
				 * If the multipath device and the underlying
 | 
						|
				 * dev are the same name (i.e. /dev/dm-5), then
 | 
						|
				 * there is no real underlying disk for this
 | 
						|
				 * multipath device, and so this "change" event
 | 
						|
				 * really is a multipath removal.
 | 
						|
				 */
 | 
						|
				class = EC_DEV_ADD;
 | 
						|
				subclass = ESC_DISK;
 | 
						|
			} else {
 | 
						|
				tmp = (char *)
 | 
						|
				    udev_device_get_property_value(dev,
 | 
						|
				    "DM_NR_VALID_PATHS");
 | 
						|
				/* treat as a multipath remove */
 | 
						|
				if (tmp != NULL && strcmp(tmp, "0") == 0) {
 | 
						|
					class = EC_DEV_REMOVE;
 | 
						|
					subclass = ESC_DISK;
 | 
						|
				}
 | 
						|
			}
 | 
						|
			free(tmp2);
 | 
						|
		}
 | 
						|
 | 
						|
		/*
 | 
						|
		 * Special case an EC_DEV_ADD for scsi_debug devices
 | 
						|
		 *
 | 
						|
		 * These devices require a udevadm trigger command after
 | 
						|
		 * creation in order to register the vdev_id scsidebug alias
 | 
						|
		 * rule (adds a persistent path (phys_path) used for fault
 | 
						|
		 * management automated tests in the ZFS test suite.
 | 
						|
		 *
 | 
						|
		 * After udevadm trigger command, event registers as a "change"
 | 
						|
		 * event but needs to instead be handled as another "add" event
 | 
						|
		 * to allow for disk labeling and partitioning to occur.
 | 
						|
		 */
 | 
						|
		if (strcmp(class, EC_DEV_STATUS) == 0 &&
 | 
						|
		    udev_device_get_property_value(dev, "ID_VDEV") &&
 | 
						|
		    udev_device_get_property_value(dev, "ID_MODEL")) {
 | 
						|
			const char *id_model, *id_model_sd = "scsi_debug";
 | 
						|
 | 
						|
			id_model = udev_device_get_property_value(dev,
 | 
						|
			    "ID_MODEL");
 | 
						|
			if (strcmp(id_model, id_model_sd) == 0) {
 | 
						|
				class = EC_DEV_ADD;
 | 
						|
				subclass = ESC_DISK;
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		if ((nvl = dev_event_nvlist(dev)) != NULL) {
 | 
						|
			zed_udev_event(class, subclass, nvl);
 | 
						|
			nvlist_free(nvl);
 | 
						|
		}
 | 
						|
 | 
						|
		udev_device_unref(dev);
 | 
						|
	}
 | 
						|
 | 
						|
	return (NULL);
 | 
						|
}
 | 
						|
 | 
						|
int
 | 
						|
zed_disk_event_init()
 | 
						|
{
 | 
						|
	int fd, fflags;
 | 
						|
 | 
						|
	if ((g_udev = udev_new()) == NULL) {
 | 
						|
		zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
 | 
						|
		return (-1);
 | 
						|
	}
 | 
						|
 | 
						|
	/* Set up a udev monitor for block devices */
 | 
						|
	g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
 | 
						|
	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
 | 
						|
	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
 | 
						|
	    "partition");
 | 
						|
	udev_monitor_enable_receiving(g_mon);
 | 
						|
 | 
						|
	/* Make sure monitoring socket is blocking */
 | 
						|
	fd = udev_monitor_get_fd(g_mon);
 | 
						|
	if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
 | 
						|
		(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
 | 
						|
 | 
						|
	/* spawn a thread to monitor events */
 | 
						|
	if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
 | 
						|
		udev_monitor_unref(g_mon);
 | 
						|
		udev_unref(g_udev);
 | 
						|
		zed_log_msg(LOG_WARNING, "pthread_create failed");
 | 
						|
		return (-1);
 | 
						|
	}
 | 
						|
 | 
						|
	pthread_setname_np(g_mon_tid, "udev monitor");
 | 
						|
	zed_log_msg(LOG_INFO, "zed_disk_event_init");
 | 
						|
 | 
						|
	return (0);
 | 
						|
}
 | 
						|
 | 
						|
void
 | 
						|
zed_disk_event_fini()
 | 
						|
{
 | 
						|
	/* cancel monitor thread at recvmsg() */
 | 
						|
	(void) pthread_cancel(g_mon_tid);
 | 
						|
	(void) pthread_join(g_mon_tid, NULL);
 | 
						|
 | 
						|
	/* cleanup udev resources */
 | 
						|
	udev_monitor_unref(g_mon);
 | 
						|
	udev_unref(g_udev);
 | 
						|
 | 
						|
	zed_log_msg(LOG_INFO, "zed_disk_event_fini");
 | 
						|
}
 | 
						|
 | 
						|
#else
 | 
						|
 | 
						|
#include "zed_disk_event.h"
 | 
						|
 | 
						|
/*
 * Stub used when built without HAVE_LIBUDEV: there is nothing to set up,
 * so initialization always reports success (0).
 */
int
zed_disk_event_init(void)
{
	return (0);
}
 | 
						|
 | 
						|
/*
 * Stub used when built without HAVE_LIBUDEV: nothing was set up, so there
 * is nothing to tear down.
 */
void
zed_disk_event_fini(void)
{
}
 | 
						|
 | 
						|
#endif /* HAVE_LIBUDEV */
 |