mirror_zfs/cmd/zed/agents/zfs_retire.c
Brian Behlendorf 02730c333c Use cstyle -cpP in make cstyle check
Enable picky cstyle checks and resolve the new warnings.  The vast
majority of the changes needed were to handle minor issues with
whitespace formatting.  This patch contains no functional changes.

Non-whitespace changes are as follows:

* 8 times ; to { } in for/while loop
* fix missing ; in cmd/zed/agents/zfs_diagnosis.c
* comment (confim -> confirm)
* change endline , to ; in cmd/zpool/zpool_main.c
* a number of /* BEGIN CSTYLED */ /* END CSTYLED */ blocks
* /* CSTYLED */ markers
* change == 0 to !
* ulong to unsigned long in module/zfs/dsl_scan.c
* rearrangement of module_param lines in module/zfs/metaslab.c
* add { } block around statement after for_each_online_node

Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Håkan Johansson <f96hajo@chalmers.se>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #5465
2016-12-12 10:46:26 -08:00

643 lines
16 KiB
C

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright (c) 2016, Intel Corporation.
*/
/*
* The ZFS retire agent is responsible for managing hot spares across all pools.
* When we see a device fault or a device removal, we try to open the associated
* pool and look for any hot spares. We iterate over any available hot spares
* and attempt a 'zpool replace' for each one.
*
* For vdevs diagnosed as faulty, the agent is also responsible for proactively
* marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
*/
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <libzfs.h>
#include <string.h>
#include "zfs_agents.h"
#include "fmd_api.h"
typedef struct zfs_retire_repaired {
struct zfs_retire_repaired *zrr_next;
uint64_t zrr_pool;
uint64_t zrr_vdev;
} zfs_retire_repaired_t;
typedef struct zfs_retire_data {
libzfs_handle_t *zrd_hdl;
zfs_retire_repaired_t *zrd_repaired;
} zfs_retire_data_t;
static void
zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
{
zfs_retire_repaired_t *zrp;
while ((zrp = zdp->zrd_repaired) != NULL) {
zdp->zrd_repaired = zrp->zrr_next;
fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
}
}
/*
* Find a pool with a matching GUID.
*/
typedef struct find_cbdata {
uint64_t cb_guid;
const char *cb_fru;
zpool_handle_t *cb_zhp;
nvlist_t *cb_vdev;
} find_cbdata_t;
static int
find_pool(zpool_handle_t *zhp, void *data)
{
find_cbdata_t *cbp = data;
if (cbp->cb_guid ==
zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
cbp->cb_zhp = zhp;
return (1);
}
zpool_close(zhp);
return (0);
}
/*
* Find a vdev within a tree with a matching GUID.
*/
static nvlist_t *
find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru,
uint64_t search_guid)
{
uint64_t guid;
nvlist_t **child;
uint_t c, children;
nvlist_t *ret;
char *fru;
if (search_fru != NULL) {
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 &&
libzfs_fru_compare(zhdl, fru, search_fru))
return (nv);
} else {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
guid == search_guid) {
fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
"matched vdev %llu", guid);
return (nv);
}
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return (NULL);
for (c = 0; c < children; c++) {
if ((ret = find_vdev(zhdl, child[c], search_fru,
search_guid)) != NULL)
return (ret);
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) != 0)
return (NULL);
for (c = 0; c < children; c++) {
if ((ret = find_vdev(zhdl, child[c], search_fru,
search_guid)) != NULL)
return (ret);
}
return (NULL);
}
/*
* Given a (pool, vdev) GUID pair, find the matching pool and vdev.
*/
static zpool_handle_t *
find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
nvlist_t **vdevp)
{
find_cbdata_t cb;
zpool_handle_t *zhp;
nvlist_t *config, *nvroot;
/*
* Find the corresponding pool and make sure the vdev still exists.
*/
cb.cb_guid = pool_guid;
if (zpool_iter(zhdl, find_pool, &cb) != 1)
return (NULL);
zhp = cb.cb_zhp;
config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) != 0) {
zpool_close(zhp);
return (NULL);
}
if (vdev_guid != 0) {
if ((*vdevp = find_vdev(zhdl, nvroot, NULL,
vdev_guid)) == NULL) {
zpool_close(zhp);
return (NULL);
}
}
return (zhp);
}
#ifdef _HAS_FMD_TOPO
static int
search_pool(zpool_handle_t *zhp, void *data)
{
find_cbdata_t *cbp = data;
nvlist_t *config;
nvlist_t *nvroot;
config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) != 0) {
zpool_close(zhp);
return (0);
}
if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot,
cbp->cb_fru, 0)) != NULL) {
cbp->cb_zhp = zhp;
return (1);
}
zpool_close(zhp);
return (0);
}
/*
* Given a FRU FMRI, find the matching pool and vdev.
*/
static zpool_handle_t *
find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp)
{
find_cbdata_t cb;
cb.cb_fru = fru;
cb.cb_zhp = NULL;
if (zpool_iter(zhdl, search_pool, &cb) != 1)
return (NULL);
*vdevp = cb.cb_vdev;
return (cb.cb_zhp);
}
#endif /* _HAS_FMD_TOPO */
/*
* Given a vdev, attempt to replace it with every known spare until one
* succeeds.
*/
static void
replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
{
nvlist_t *config, *nvroot, *replacement;
nvlist_t **spares;
uint_t s, nspares;
char *dev_name;
config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) != 0)
return;
/*
* Find out if there are any hot spares available in the pool.
*/
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) != 0)
return;
replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);
(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_ROOT);
dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
/*
* Try to replace each spare, ending when we successfully
* replace it.
*/
for (s = 0; s < nspares; s++) {
char *spare_name;
if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
&spare_name) != 0)
continue;
(void) nvlist_add_nvlist_array(replacement,
ZPOOL_CONFIG_CHILDREN, &spares[s], 1);
fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
dev_name, basename(spare_name));
if (zpool_vdev_attach(zhp, dev_name, spare_name,
replacement, B_TRUE) == 0)
break;
}
free(dev_name);
nvlist_free(replacement);
}
/*
* Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
* ASRU is now usable. ZFS has found the device to be present and
* functioning.
*/
/*ARGSUSED*/
static void
zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
{
zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
zfs_retire_repaired_t *zrp;
uint64_t pool_guid, vdev_guid;
#ifdef _HAS_FMD_TOPO
nvlist_t *asru;
#endif
if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
&pool_guid) != 0 || nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
return;
/*
* Before checking the state of the ASRU, go through and see if we've
* already made an attempt to repair this ASRU. This list is cleared
* whenever we receive any kind of list event, and is designed to
* prevent us from generating a feedback loop when we attempt repairs
* against a faulted pool. The problem is that checking the unusable
* state of the ASRU can involve opening the pool, which can post
* statechange events but otherwise leave the pool in the faulted
* state. This list allows us to detect when a statechange event is
* due to our own request.
*/
for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
if (zrp->zrr_pool == pool_guid &&
zrp->zrr_vdev == vdev_guid)
return;
}
#ifdef _HAS_FMD_TOPO
asru = fmd_nvl_alloc(hdl, FMD_SLEEP);
(void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0);
(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid);
(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid);
/*
* We explicitly check for the unusable state here to make sure we
* aren't responding to a transient state change. As part of opening a
* vdev, it's possible to see the 'statechange' event, only to be
* followed by a vdev failure later. If we don't check the current
* state of the vdev (or pool) before marking it repaired, then we risk
* generating spurious repair events followed immediately by the same
* diagnosis.
*
* This assumes that the ZFS scheme code associated unusable (i.e.
* isolated) with its own definition of faulty state. In the case of a
* DEGRADED leaf vdev (due to checksum errors), this is not the case.
* This works, however, because the transient state change is not
* posted in this case. This could be made more explicit by not
* relying on the scheme's unusable callback and instead directly
* checking the vdev state, where we could correctly account for
* DEGRADED state.
*/
if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl,
asru, FMD_HAS_FAULT_ASRU, NULL)) {
topo_hdl_t *thp;
char *fmri = NULL;
int err;
thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0)
(void) fmd_repair_asru(hdl, fmri);
fmd_hdl_topo_rele(hdl, thp);
topo_hdl_strfree(thp, fmri);
}
nvlist_free(asru);
#endif
zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
zrp->zrr_next = zdp->zrd_repaired;
zrp->zrr_pool = pool_guid;
zrp->zrr_vdev = vdev_guid;
zdp->zrd_repaired = zrp;
fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
vdev_guid, pool_guid);
}
/*ARGSUSED*/
static void
zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
const char *class)
{
uint64_t pool_guid, vdev_guid;
zpool_handle_t *zhp;
nvlist_t *resource, *fault;
nvlist_t **faults;
uint_t f, nfaults;
zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
libzfs_handle_t *zhdl = zdp->zrd_hdl;
boolean_t fault_device, degrade_device;
boolean_t is_repair;
char *scheme;
nvlist_t *vdev = NULL;
char *uuid;
int repair_done = 0;
boolean_t retire;
boolean_t is_disk;
vdev_aux_t aux;
uint64_t state = 0;
fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
/*
* If this is a resource notifying us of device removal, then simply
* check for an available spare and continue.
*/
if (strcmp(class, "resource.fs.zfs.removed") == 0) {
if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
&pool_guid) != 0 ||
nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
&vdev_guid) != 0)
return;
if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
&vdev)) == NULL)
return;
if (fmd_prop_get_int32(hdl, "spare_on_remove"))
replace_with_spare(hdl, zhp, vdev);
zpool_close(zhp);
return;
}
if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
return;
/*
* Note: on zfsonlinux statechange events are more than just
* healthy ones so we need to confirm the actual state value.
*/
if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE,
&state) == 0 && state == VDEV_STATE_HEALTHY) {
zfs_vdev_repair(hdl, nvl);
return;
}
if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
zfs_vdev_repair(hdl, nvl);
return;
}
zfs_retire_clear_data(hdl, zdp);
if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
is_repair = B_TRUE;
else
is_repair = B_FALSE;
/*
* We subscribe to zfs faults as well as all repair events.
*/
if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
&faults, &nfaults) != 0)
return;
for (f = 0; f < nfaults; f++) {
fault = faults[f];
fault_device = B_FALSE;
degrade_device = B_FALSE;
is_disk = B_FALSE;
if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
&retire) == 0 && retire == 0)
continue;
/*
* While we subscribe to fault.fs.zfs.*, we only take action
* for faults targeting a specific vdev (open failure or SERD
* failure). We also subscribe to fault.io.* events, so that
* faulty disks will be faulted in the ZFS configuration.
*/
if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
fault_device = B_TRUE;
} else if (fmd_nvl_class_match(hdl, fault,
"fault.fs.zfs.vdev.checksum")) {
degrade_device = B_TRUE;
} else if (fmd_nvl_class_match(hdl, fault,
"fault.fs.zfs.device")) {
fault_device = B_FALSE;
} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
is_disk = B_TRUE;
fault_device = B_TRUE;
} else {
continue;
}
if (is_disk) {
#ifdef _HAS_FMD_TOPO
/*
* This is a disk fault. Lookup the FRU, convert it to
* an FMRI string, and attempt to find a matching vdev.
*/
if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU,
&fru) != 0 ||
nvlist_lookup_string(fru, FM_FMRI_SCHEME,
&scheme) != 0)
continue;
if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0)
continue;
thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
fmd_hdl_topo_rele(hdl, thp);
continue;
}
zhp = find_by_fru(zhdl, fmri, &vdev);
topo_hdl_strfree(thp, fmri);
fmd_hdl_topo_rele(hdl, thp);
if (zhp == NULL)
continue;
(void) nvlist_lookup_uint64(vdev,
ZPOOL_CONFIG_GUID, &vdev_guid);
aux = VDEV_AUX_EXTERNAL;
#else
continue;
#endif
} else {
/*
* This is a ZFS fault. Lookup the resource, and
* attempt to find the matching vdev.
*/
if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
&resource) != 0 ||
nvlist_lookup_string(resource, FM_FMRI_SCHEME,
&scheme) != 0)
continue;
if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
continue;
if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
&pool_guid) != 0)
continue;
if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
&vdev_guid) != 0) {
if (is_repair)
vdev_guid = 0;
else
continue;
}
if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
&vdev)) == NULL)
continue;
aux = VDEV_AUX_ERR_EXCEEDED;
}
if (vdev_guid == 0) {
/*
* For pool-level repair events, clear the entire pool.
*/
fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
zpool_get_name(zhp));
(void) zpool_clear(zhp, NULL, NULL);
zpool_close(zhp);
continue;
}
/*
* If this is a repair event, then mark the vdev as repaired and
* continue.
*/
if (is_repair) {
repair_done = 1;
fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
zpool_get_name(zhp), vdev_guid);
(void) zpool_vdev_clear(zhp, vdev_guid);
zpool_close(zhp);
continue;
}
/*
* Actively fault the device if needed.
*/
if (fault_device)
(void) zpool_vdev_fault(zhp, vdev_guid, aux);
if (degrade_device)
(void) zpool_vdev_degrade(zhp, vdev_guid, aux);
if (fault_device || degrade_device)
fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
fault_device ? "fault" : "degrade", vdev_guid,
zpool_get_name(zhp));
/*
* Attempt to substitute a hot spare.
*/
replace_with_spare(hdl, zhp, vdev);
zpool_close(zhp);
}
if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
fmd_case_uuresolved(hdl, uuid);
}
static const fmd_hdl_ops_t fmd_ops = {
zfs_retire_recv, /* fmdo_recv */
NULL, /* fmdo_timeout */
NULL, /* fmdo_close */
NULL, /* fmdo_stats */
NULL, /* fmdo_gc */
};
static const fmd_prop_t fmd_props[] = {
{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
{ NULL, 0, NULL }
};
static const fmd_hdl_info_t fmd_info = {
"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
};
void
_zfs_retire_init(fmd_hdl_t *hdl)
{
zfs_retire_data_t *zdp;
libzfs_handle_t *zhdl;
if ((zhdl = __libzfs_init()) == NULL)
return;
if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
libzfs_fini(zhdl);
return;
}
zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
zdp->zrd_hdl = zhdl;
fmd_hdl_setspecific(hdl, zdp);
}
void
_zfs_retire_fini(fmd_hdl_t *hdl)
{
zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
if (zdp != NULL) {
zfs_retire_clear_data(hdl, zdp);
__libzfs_fini(zdp->zrd_hdl);
fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
}
}