2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2016-01-21 03:31:44 +03:00
|
|
|
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2020-05-07 19:36:33 +03:00
|
|
|
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
|
2017-02-08 01:02:27 +03:00
|
|
|
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
|
2018-06-08 04:07:29 +03:00
|
|
|
* Copyright (c) 2018 Datto Inc.
|
2017-10-26 22:26:09 +03:00
|
|
|
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
|
2018-09-06 04:33:36 +03:00
|
|
|
* Copyright (c) 2017, Intel Corporation.
|
2018-11-06 21:14:56 +03:00
|
|
|
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
|
2021-02-18 08:30:45 +03:00
|
|
|
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
#include <libintl.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <strings.h>
|
|
|
|
#include <unistd.h>
|
2013-08-28 15:45:09 +04:00
|
|
|
#include <libgen.h>
|
2010-08-26 22:56:53 +04:00
|
|
|
#include <zone.h>
|
|
|
|
#include <sys/stat.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/efi_partition.h>
|
2017-08-10 01:31:08 +03:00
|
|
|
#include <sys/systeminfo.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/zfs_ioctl.h>
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
#include <sys/zfs_sysfs.h>
|
2018-05-31 20:36:37 +03:00
|
|
|
#include <sys/vdev_disk.h>
|
2021-02-18 08:30:45 +03:00
|
|
|
#include <sys/types.h>
|
2009-07-03 02:44:48 +04:00
|
|
|
#include <dlfcn.h>
|
2018-11-05 22:22:33 +03:00
|
|
|
#include <libzutil.h>
|
2021-02-18 08:30:45 +03:00
|
|
|
#include <fcntl.h>
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#include "zfs_namecheck.h"
|
|
|
|
#include "zfs_prop.h"
|
|
|
|
#include "libzfs_impl.h"
|
2010-05-29 00:45:14 +04:00
|
|
|
#include "zfs_comutil.h"
|
2012-12-14 03:24:15 +04:00
|
|
|
#include "zfeature_common.h"
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-01-19 20:20:58 +03:00
|
|
|
static boolean_t zpool_vdev_is_interior(const char *name);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
 * Context flags handed to the property validation code; each bit records
 * the operation on whose behalf a property list is being checked.
 */
typedef struct prop_flags {
	unsigned int create:1;	/* Validate property on creation */
	unsigned int import:1;	/* Validate property on import */
} prop_flags_t;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* ====================================================================
|
|
|
|
* zpool property functions
|
|
|
|
* ====================================================================
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
zpool_get_all_props(zpool_handle_t *zhp)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
|
|
|
|
if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
|
|
|
|
return (-1);
|
|
|
|
|
2019-10-24 03:29:43 +03:00
|
|
|
while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (errno == ENOMEM) {
|
|
|
|
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) {
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
int
|
2008-11-20 23:01:55 +03:00
|
|
|
zpool_props_refresh(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
nvlist_t *old_props;
|
|
|
|
|
|
|
|
old_props = zhp->zpool_props;
|
|
|
|
|
|
|
|
if (zpool_get_all_props(zhp) != 0)
|
|
|
|
return (-1);
|
|
|
|
|
|
|
|
nvlist_free(old_props);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2018-03-10 00:47:32 +03:00
|
|
|
static const char *
|
2008-11-20 23:01:55 +03:00
|
|
|
zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop,
|
|
|
|
zprop_source_t *src)
|
|
|
|
{
|
|
|
|
nvlist_t *nv, *nvl;
|
|
|
|
uint64_t ival;
|
|
|
|
char *value;
|
|
|
|
zprop_source_t source;
|
|
|
|
|
|
|
|
nvl = zhp->zpool_props;
|
|
|
|
if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
|
|
|
|
verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0);
|
|
|
|
source = ival;
|
|
|
|
verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
|
|
|
|
} else {
|
|
|
|
source = ZPROP_SRC_DEFAULT;
|
|
|
|
if ((value = (char *)zpool_prop_default_string(prop)) == NULL)
|
|
|
|
value = "-";
|
|
|
|
}
|
|
|
|
|
|
|
|
if (src)
|
|
|
|
*src = source;
|
|
|
|
|
|
|
|
return (value);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src)
|
|
|
|
{
|
|
|
|
nvlist_t *nv, *nvl;
|
|
|
|
uint64_t value;
|
|
|
|
zprop_source_t source;
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) {
|
|
|
|
/*
|
|
|
|
* zpool_get_all_props() has most likely failed because
|
|
|
|
* the pool is faulted, but if all we need is the top level
|
|
|
|
* vdev's guid then get it from the zhp config nvlist.
|
|
|
|
*/
|
|
|
|
if ((prop == ZPOOL_PROP_GUID) &&
|
|
|
|
(nvlist_lookup_nvlist(zhp->zpool_config,
|
|
|
|
ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) &&
|
|
|
|
(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value)
|
|
|
|
== 0)) {
|
|
|
|
return (value);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zpool_prop_default_numeric(prop));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
nvl = zhp->zpool_props;
|
|
|
|
if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
|
|
|
|
verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0);
|
|
|
|
source = value;
|
|
|
|
verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
|
|
|
|
} else {
|
|
|
|
source = ZPROP_SRC_DEFAULT;
|
|
|
|
value = zpool_prop_default_numeric(prop);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (src)
|
|
|
|
*src = source;
|
|
|
|
|
|
|
|
return (value);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Map VDEV STATE to printed strings.
|
|
|
|
*/
|
2018-03-10 00:47:32 +03:00
|
|
|
const char *
|
2008-11-20 23:01:55 +03:00
|
|
|
zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
|
|
|
|
{
|
|
|
|
switch (state) {
|
|
|
|
case VDEV_STATE_CLOSED:
|
|
|
|
case VDEV_STATE_OFFLINE:
|
|
|
|
return (gettext("OFFLINE"));
|
|
|
|
case VDEV_STATE_REMOVED:
|
|
|
|
return (gettext("REMOVED"));
|
|
|
|
case VDEV_STATE_CANT_OPEN:
|
2008-12-03 23:09:06 +03:00
|
|
|
if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (gettext("FAULTED"));
|
2010-05-29 00:45:14 +04:00
|
|
|
else if (aux == VDEV_AUX_SPLIT_POOL)
|
|
|
|
return (gettext("SPLIT"));
|
2008-11-20 23:01:55 +03:00
|
|
|
else
|
|
|
|
return (gettext("UNAVAIL"));
|
|
|
|
case VDEV_STATE_FAULTED:
|
|
|
|
return (gettext("FAULTED"));
|
|
|
|
case VDEV_STATE_DEGRADED:
|
|
|
|
return (gettext("DEGRADED"));
|
|
|
|
case VDEV_STATE_HEALTHY:
|
|
|
|
return (gettext("ONLINE"));
|
2017-02-08 01:02:27 +03:00
|
|
|
|
|
|
|
default:
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (gettext("UNKNOWN"));
|
|
|
|
}
|
|
|
|
|
2013-07-05 15:01:44 +04:00
|
|
|
/*
|
|
|
|
* Map POOL STATE to printed strings.
|
|
|
|
*/
|
|
|
|
const char *
|
|
|
|
zpool_pool_state_to_name(pool_state_t state)
|
|
|
|
{
|
|
|
|
switch (state) {
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
case POOL_STATE_ACTIVE:
|
|
|
|
return (gettext("ACTIVE"));
|
|
|
|
case POOL_STATE_EXPORTED:
|
|
|
|
return (gettext("EXPORTED"));
|
|
|
|
case POOL_STATE_DESTROYED:
|
|
|
|
return (gettext("DESTROYED"));
|
|
|
|
case POOL_STATE_SPARE:
|
|
|
|
return (gettext("SPARE"));
|
|
|
|
case POOL_STATE_L2CACHE:
|
|
|
|
return (gettext("L2CACHE"));
|
|
|
|
case POOL_STATE_UNINITIALIZED:
|
|
|
|
return (gettext("UNINITIALIZED"));
|
|
|
|
case POOL_STATE_UNAVAIL:
|
|
|
|
return (gettext("UNAVAIL"));
|
|
|
|
case POOL_STATE_POTENTIALLY_ACTIVE:
|
|
|
|
return (gettext("POTENTIALLY_ACTIVE"));
|
|
|
|
}
|
|
|
|
|
|
|
|
return (gettext("UNKNOWN"));
|
|
|
|
}
|
|
|
|
|
2018-06-06 19:33:54 +03:00
|
|
|
/*
|
|
|
|
* Given a pool handle, return the pool health string ("ONLINE", "DEGRADED",
|
|
|
|
* "SUSPENDED", etc).
|
|
|
|
*/
|
|
|
|
const char *
|
|
|
|
zpool_get_state_str(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
zpool_errata_t errata;
|
|
|
|
zpool_status_t status;
|
|
|
|
nvlist_t *nvroot;
|
|
|
|
vdev_stat_t *vs;
|
|
|
|
uint_t vsc;
|
|
|
|
const char *str;
|
|
|
|
|
|
|
|
status = zpool_get_status(zhp, NULL, &errata);
|
|
|
|
|
|
|
|
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
|
|
|
|
str = gettext("FAULTED");
|
|
|
|
} else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT ||
|
|
|
|
status == ZPOOL_STATUS_IO_FAILURE_MMP) {
|
|
|
|
str = gettext("SUSPENDED");
|
|
|
|
} else {
|
|
|
|
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
|
|
|
|
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
|
|
|
|
verify(nvlist_lookup_uint64_array(nvroot,
|
|
|
|
ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
|
|
|
|
== 0);
|
|
|
|
str = zpool_state_to_name(vs->vs_state, vs->vs_aux);
|
|
|
|
}
|
|
|
|
return (str);
|
|
|
|
}
|
|
|
|
|
2013-10-23 12:33:33 +04:00
|
|
|
/*
|
|
|
|
* Get a zpool property value for 'prop' and return the value in
|
|
|
|
* a pre-allocated buffer.
|
|
|
|
*/
|
|
|
|
int
|
2016-05-10 00:03:18 +03:00
|
|
|
zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
|
2013-11-01 23:26:11 +04:00
|
|
|
size_t len, zprop_source_t *srctype, boolean_t literal)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
uint64_t intval;
|
|
|
|
const char *strval;
|
|
|
|
zprop_source_t src = ZPROP_SRC_NONE;
|
|
|
|
|
|
|
|
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
|
2009-02-18 23:51:31 +03:00
|
|
|
switch (prop) {
|
|
|
|
case ZPOOL_PROP_NAME:
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strlcpy(buf, zpool_get_name(zhp), len);
|
2009-02-18 23:51:31 +03:00
|
|
|
break;
|
|
|
|
|
|
|
|
case ZPOOL_PROP_HEALTH:
|
2018-06-06 19:33:54 +03:00
|
|
|
(void) strlcpy(buf, zpool_get_state_str(zhp), len);
|
2009-02-18 23:51:31 +03:00
|
|
|
break;
|
|
|
|
|
|
|
|
case ZPOOL_PROP_GUID:
|
|
|
|
intval = zpool_get_prop_int(zhp, prop, &src);
|
2010-08-26 20:52:39 +04:00
|
|
|
(void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
|
2009-02-18 23:51:31 +03:00
|
|
|
break;
|
|
|
|
|
|
|
|
case ZPOOL_PROP_ALTROOT:
|
|
|
|
case ZPOOL_PROP_CACHEFILE:
|
2011-11-15 23:01:27 +04:00
|
|
|
case ZPOOL_PROP_COMMENT:
|
2021-02-18 08:30:45 +03:00
|
|
|
case ZPOOL_PROP_COMPATIBILITY:
|
2009-02-18 23:51:31 +03:00
|
|
|
if (zhp->zpool_props != NULL ||
|
|
|
|
zpool_get_all_props(zhp) == 0) {
|
|
|
|
(void) strlcpy(buf,
|
|
|
|
zpool_get_prop_string(zhp, prop, &src),
|
|
|
|
len);
|
2016-05-10 00:03:18 +03:00
|
|
|
break;
|
2009-02-18 23:51:31 +03:00
|
|
|
}
|
2021-09-14 19:17:54 +03:00
|
|
|
fallthrough;
|
2009-02-18 23:51:31 +03:00
|
|
|
default:
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strlcpy(buf, "-", len);
|
2009-02-18 23:51:31 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (srctype != NULL)
|
|
|
|
*srctype = src;
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) &&
|
|
|
|
prop != ZPOOL_PROP_NAME)
|
|
|
|
return (-1);
|
|
|
|
|
|
|
|
switch (zpool_prop_get_type(prop)) {
|
|
|
|
case PROP_TYPE_STRING:
|
|
|
|
(void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src),
|
|
|
|
len);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case PROP_TYPE_NUMBER:
|
|
|
|
intval = zpool_get_prop_int(zhp, prop, &src);
|
|
|
|
|
|
|
|
switch (prop) {
|
|
|
|
case ZPOOL_PROP_SIZE:
|
2010-05-29 00:45:14 +04:00
|
|
|
case ZPOOL_PROP_ALLOCATED:
|
|
|
|
case ZPOOL_PROP_FREE:
|
2012-12-14 03:24:15 +04:00
|
|
|
case ZPOOL_PROP_FREEING:
|
2014-06-06 01:20:08 +04:00
|
|
|
case ZPOOL_PROP_LEAKED:
|
2011-06-16 23:56:38 +04:00
|
|
|
case ZPOOL_PROP_ASHIFT:
|
2013-10-23 12:33:33 +04:00
|
|
|
if (literal)
|
|
|
|
(void) snprintf(buf, len, "%llu",
|
2016-12-12 21:46:26 +03:00
|
|
|
(u_longlong_t)intval);
|
2013-10-23 12:33:33 +04:00
|
|
|
else
|
|
|
|
(void) zfs_nicenum(intval, buf, len);
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
|
2014-09-12 07:07:20 +04:00
|
|
|
case ZPOOL_PROP_EXPANDSZ:
|
2016-12-17 01:11:29 +03:00
|
|
|
case ZPOOL_PROP_CHECKPOINT:
|
2014-09-12 07:07:20 +04:00
|
|
|
if (intval == 0) {
|
|
|
|
(void) strlcpy(buf, "-", len);
|
|
|
|
} else if (literal) {
|
|
|
|
(void) snprintf(buf, len, "%llu",
|
|
|
|
(u_longlong_t)intval);
|
|
|
|
} else {
|
2017-05-02 23:43:53 +03:00
|
|
|
(void) zfs_nicebytes(intval, buf, len);
|
2014-09-12 07:07:20 +04:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
case ZPOOL_PROP_CAPACITY:
|
2016-05-10 00:03:18 +03:00
|
|
|
if (literal) {
|
|
|
|
(void) snprintf(buf, len, "%llu",
|
|
|
|
(u_longlong_t)intval);
|
|
|
|
} else {
|
|
|
|
(void) snprintf(buf, len, "%llu%%",
|
|
|
|
(u_longlong_t)intval);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
|
2014-09-05 10:06:55 +04:00
|
|
|
case ZPOOL_PROP_FRAGMENTATION:
|
|
|
|
if (intval == UINT64_MAX) {
|
|
|
|
(void) strlcpy(buf, "-", len);
|
2016-05-16 22:29:54 +03:00
|
|
|
} else if (literal) {
|
|
|
|
(void) snprintf(buf, len, "%llu",
|
|
|
|
(u_longlong_t)intval);
|
2014-09-05 10:06:55 +04:00
|
|
|
} else {
|
|
|
|
(void) snprintf(buf, len, "%llu%%",
|
|
|
|
(u_longlong_t)intval);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
case ZPOOL_PROP_DEDUPRATIO:
|
2016-05-16 22:29:54 +03:00
|
|
|
if (literal)
|
|
|
|
(void) snprintf(buf, len, "%llu.%02llu",
|
|
|
|
(u_longlong_t)(intval / 100),
|
|
|
|
(u_longlong_t)(intval % 100));
|
|
|
|
else
|
|
|
|
(void) snprintf(buf, len, "%llu.%02llux",
|
|
|
|
(u_longlong_t)(intval / 100),
|
|
|
|
(u_longlong_t)(intval % 100));
|
2010-05-29 00:45:14 +04:00
|
|
|
break;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
case ZPOOL_PROP_HEALTH:
|
2018-06-06 19:33:54 +03:00
|
|
|
(void) strlcpy(buf, zpool_get_state_str(zhp), len);
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
2012-12-14 03:24:15 +04:00
|
|
|
case ZPOOL_PROP_VERSION:
|
|
|
|
if (intval >= SPA_VERSION_FEATURES) {
|
|
|
|
(void) snprintf(buf, len, "-");
|
|
|
|
break;
|
|
|
|
}
|
2021-09-14 19:17:54 +03:00
|
|
|
fallthrough;
|
2008-11-20 23:01:55 +03:00
|
|
|
default:
|
2010-08-26 20:52:39 +04:00
|
|
|
(void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case PROP_TYPE_INDEX:
|
|
|
|
intval = zpool_get_prop_int(zhp, prop, &src);
|
|
|
|
if (zpool_prop_index_to_string(prop, intval, &strval)
|
|
|
|
!= 0)
|
|
|
|
return (-1);
|
|
|
|
(void) strlcpy(buf, strval, len);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (srctype)
|
|
|
|
*srctype = src;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if the bootfs name has the same pool name as it is set to.
|
|
|
|
* Assuming bootfs is a valid dataset name.
|
|
|
|
*/
|
|
|
|
static boolean_t
|
2020-05-07 19:36:33 +03:00
|
|
|
bootfs_name_valid(const char *pool, const char *bootfs)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
int len = strlen(pool);
|
2017-03-24 01:28:22 +03:00
|
|
|
if (bootfs[0] == '\0')
|
|
|
|
return (B_TRUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT))
|
2008-11-20 23:01:55 +03:00
|
|
|
return (B_FALSE);
|
|
|
|
|
|
|
|
if (strncmp(pool, bootfs, len) == 0 &&
|
|
|
|
(bootfs[len] == '/' || bootfs[len] == '\0'))
|
|
|
|
return (B_TRUE);
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given an nvlist of zpool properties to be set, validate that they are
|
|
|
|
* correct, and parse any numeric properties (index, boolean, etc) if they are
|
|
|
|
* specified as strings.
|
|
|
|
*/
|
|
|
|
static nvlist_t *
|
2008-12-03 23:09:06 +03:00
|
|
|
zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
nvpair_t *elem;
|
|
|
|
nvlist_t *retprops;
|
|
|
|
zpool_prop_t prop;
|
|
|
|
char *strval;
|
|
|
|
uint64_t intval;
|
2011-11-15 23:01:27 +04:00
|
|
|
char *slash, *check;
|
2008-11-20 23:01:55 +03:00
|
|
|
struct stat64 statbuf;
|
2008-12-03 23:09:06 +03:00
|
|
|
zpool_handle_t *zhp;
|
2021-04-12 19:08:56 +03:00
|
|
|
char report[1024];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) {
|
|
|
|
(void) no_memory(hdl);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
elem = NULL;
|
|
|
|
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
|
|
|
|
const char *propname = nvpair_name(elem);
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
prop = zpool_name_to_prop(propname);
|
2018-01-19 20:22:37 +03:00
|
|
|
if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) {
|
2012-12-14 03:24:15 +04:00
|
|
|
int err;
|
|
|
|
char *fname = strchr(propname, '@') + 1;
|
|
|
|
|
2013-10-08 21:13:05 +04:00
|
|
|
err = zfeature_lookup_name(fname, NULL);
|
2012-12-14 03:24:15 +04:00
|
|
|
if (err != 0) {
|
|
|
|
ASSERT3U(err, ==, ENOENT);
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
"feature '%s' unsupported by kernel"),
|
|
|
|
fname);
|
2012-12-14 03:24:15 +04:00
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvpair_type(elem) != DATA_TYPE_STRING) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"'%s' must be a string"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) nvpair_value_string(elem, &strval);
|
2016-10-26 02:17:47 +03:00
|
|
|
if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0 &&
|
|
|
|
strcmp(strval, ZFS_FEATURE_DISABLED) != 0) {
|
2012-12-14 03:24:15 +04:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property '%s' can only be set to "
|
2016-10-26 02:17:47 +03:00
|
|
|
"'enabled' or 'disabled'"), propname);
|
2018-04-12 00:45:58 +03:00
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!flags.create &&
|
|
|
|
strcmp(strval, ZFS_FEATURE_DISABLED) == 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property '%s' can only be set to "
|
|
|
|
"'disabled' at creation time"), propname);
|
2012-12-14 03:24:15 +04:00
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_add_uint64(retprops, propname, 0) != 0) {
|
|
|
|
(void) no_memory(hdl);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Make sure this property is valid and applies to this type.
|
|
|
|
*/
|
2018-01-19 20:22:37 +03:00
|
|
|
if (prop == ZPOOL_PROP_INVAL) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"invalid property '%s'"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zpool_prop_readonly(prop)) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
|
|
|
|
"is readonly"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2017-08-15 00:28:43 +03:00
|
|
|
if (!flags.create && zpool_prop_setonce(prop)) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property '%s' can only be set at "
|
|
|
|
"creation time"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops,
|
|
|
|
&strval, &intval, errbuf) != 0)
|
|
|
|
goto error;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform additional checking for specific properties.
|
|
|
|
*/
|
|
|
|
switch (prop) {
|
|
|
|
case ZPOOL_PROP_VERSION:
|
2012-12-14 03:24:15 +04:00
|
|
|
if (intval < version ||
|
|
|
|
!SPA_VERSION_IS_SUPPORTED(intval)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
2021-05-15 13:23:45 +03:00
|
|
|
"property '%s' number %llu is invalid."),
|
|
|
|
propname, (unsigned long long)intval);
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2011-06-16 23:56:38 +04:00
|
|
|
case ZPOOL_PROP_ASHIFT:
|
2017-03-29 03:21:11 +03:00
|
|
|
if (intval != 0 &&
|
|
|
|
(intval < ASHIFT_MIN || intval > ASHIFT_MAX)) {
|
2011-06-16 23:56:38 +04:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
2021-05-15 13:23:45 +03:00
|
|
|
"property '%s' number %llu is invalid, "
|
|
|
|
"only values between %" PRId32 " and %"
|
|
|
|
PRId32 " are allowed."),
|
|
|
|
propname, (unsigned long long)intval,
|
|
|
|
ASHIFT_MIN, ASHIFT_MAX);
|
2011-06-16 23:56:38 +04:00
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
case ZPOOL_PROP_BOOTFS:
|
2010-08-27 01:24:34 +04:00
|
|
|
if (flags.create || flags.import) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property '%s' cannot be set at creation "
|
|
|
|
"or import time"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (version < SPA_VERSION_BOOTFS) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"pool must be upgraded to support "
|
|
|
|
"'%s' property"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* bootfs property value has to be a dataset name and
|
|
|
|
* the dataset has to be in the same pool as it sets to.
|
|
|
|
*/
|
2017-03-24 01:28:22 +03:00
|
|
|
if (!bootfs_name_valid(poolname, strval)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
|
|
|
|
"is an invalid name"), strval);
|
|
|
|
(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"could not open pool '%s'"), poolname);
|
|
|
|
(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
zpool_close(zhp);
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
|
|
|
|
case ZPOOL_PROP_ALTROOT:
|
2010-08-27 01:24:34 +04:00
|
|
|
if (!flags.create && !flags.import) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property '%s' can only be set during pool "
|
|
|
|
"creation or import"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (strval[0] != '/') {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"bad alternate root '%s'"), strval);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ZPOOL_PROP_CACHEFILE:
|
|
|
|
if (strval[0] == '\0')
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (strcmp(strval, "none") == 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (strval[0] != '/') {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property '%s' must be empty, an "
|
|
|
|
"absolute path, or 'none'"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
slash = strrchr(strval, '/');
|
|
|
|
|
|
|
|
if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
|
|
|
|
strcmp(slash, "/..") == 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"'%s' is not a valid file"), strval);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
*slash = '\0';
|
|
|
|
|
|
|
|
if (strval[0] != '\0' &&
|
|
|
|
(stat64(strval, &statbuf) != 0 ||
|
|
|
|
!S_ISDIR(statbuf.st_mode))) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"'%s' is not a valid directory"),
|
|
|
|
strval);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
*slash = '/';
|
|
|
|
break;
|
2010-08-27 01:24:34 +04:00
|
|
|
|
2021-02-18 08:30:45 +03:00
|
|
|
case ZPOOL_PROP_COMPATIBILITY:
|
2021-04-12 19:08:56 +03:00
|
|
|
switch (zpool_load_compat(strval, NULL, report, 1024)) {
|
2021-02-18 08:30:45 +03:00
|
|
|
case ZPOOL_COMPATIBILITY_OK:
|
2021-04-12 19:08:56 +03:00
|
|
|
case ZPOOL_COMPATIBILITY_WARNTOKEN:
|
2021-02-18 08:30:45 +03:00
|
|
|
break;
|
|
|
|
case ZPOOL_COMPATIBILITY_BADFILE:
|
2021-04-12 19:08:56 +03:00
|
|
|
case ZPOOL_COMPATIBILITY_BADTOKEN:
|
2021-02-18 08:30:45 +03:00
|
|
|
case ZPOOL_COMPATIBILITY_NOFILES:
|
2021-05-15 13:23:45 +03:00
|
|
|
zfs_error_aux(hdl, "%s", report);
|
2021-02-18 08:30:45 +03:00
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2011-11-15 23:01:27 +04:00
|
|
|
case ZPOOL_PROP_COMMENT:
|
|
|
|
for (check = strval; *check != '\0'; check++) {
|
|
|
|
if (!isprint(*check)) {
|
|
|
|
zfs_error_aux(hdl,
|
|
|
|
dgettext(TEXT_DOMAIN,
|
|
|
|
"comment may only have printable "
|
|
|
|
"characters"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP,
|
|
|
|
errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (strlen(strval) > ZPROP_MAX_COMMENT) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"comment must not exceed %d characters"),
|
|
|
|
ZPROP_MAX_COMMENT);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
break;
|
2010-08-27 01:24:34 +04:00
|
|
|
case ZPOOL_PROP_READONLY:
|
|
|
|
if (!flags.import) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property '%s' can only be set at "
|
|
|
|
"import time"), propname);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
break;
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
case ZPOOL_PROP_MULTIHOST:
|
|
|
|
if (get_system_hostid() == 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"requires a non-zero system hostid"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
break;
|
2018-11-06 21:14:56 +03:00
|
|
|
case ZPOOL_PROP_DEDUPDITTO:
|
Remove dedupditto functionality
If dedup is in use, the `dedupditto` property can be set, causing ZFS to
keep an extra copy of data that is referenced many times (>100x). The
idea was that this data is more important than other data and thus we
want to be really sure that it is not lost if the disk experiences a
small amount of random corruption.
ZFS (and system administrators) rely on the pool-level redundancy to
protect their data (e.g. mirroring or RAIDZ). Since the user/sysadmin
doesn't have control over what data will be offered extra redundancy by
dedupditto, this extra redundancy is not very useful. The bulk of the
data is still vulnerable to loss based on the pool-level redundancy.
For example, if particle strikes corrupt 0.1% of blocks, you will either
be saved by mirror/raidz, or you will be sad. This is true even if
dedupditto saved another 0.01% of blocks from being corrupted.
Therefore, the dedupditto functionality is rarely enabled (i.e. the
property is rarely set), and it fulfills its promise of increased
redundancy even more rarely.
Additionally, this feature does not work as advertised (on existing
releases), because scrub/resilver did not repair the extra (dedupditto)
copy (see https://github.com/zfsonlinux/zfs/pull/8270).
In summary, this seldom-used feature doesn't work, and even if it did it
wouldn't provide useful data protection. It has a non-trivial
maintenance burden (again see https://github.com/zfsonlinux/zfs/pull/8270).
We should remove the dedupditto functionality. For backwards
compatibility with the existing CLI, "zpool set dedupditto" will still
"succeed" (exit code zero), but won't have any effect. For backwards
compatibility with existing pools that had dedupditto enabled at some
point, the code will still be able to understand dedupditto blocks and
free them when appropriate. However, ZFS won't write any new dedupditto
blocks.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Reviewed-by: Alek Pinchuk <apinchuk@datto.com>
Issue #8270
Closes #8310
2019-06-20 00:54:02 +03:00
|
|
|
printf("Note: property '%s' no longer has "
|
|
|
|
"any effect\n", propname);
|
2018-11-06 21:14:56 +03:00
|
|
|
break;
|
2018-09-06 04:33:36 +03:00
|
|
|
|
2017-02-08 01:02:27 +03:00
|
|
|
default:
|
2014-06-21 03:00:11 +04:00
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (retprops);
|
|
|
|
error:
|
|
|
|
nvlist_free(retprops);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set zpool property : propname=propval.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
int ret = -1;
|
|
|
|
char errbuf[1024];
|
|
|
|
nvlist_t *nvl = NULL;
|
|
|
|
nvlist_t *realprops;
|
|
|
|
uint64_t version;
|
2010-08-27 01:24:34 +04:00
|
|
|
prop_flags_t flags = { 0 };
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) snprintf(errbuf, sizeof (errbuf),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
|
|
|
|
zhp->zpool_name);
|
|
|
|
|
|
|
|
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
|
|
|
|
return (no_memory(zhp->zpool_hdl));
|
|
|
|
|
|
|
|
if (nvlist_add_string(nvl, propname, propval) != 0) {
|
|
|
|
nvlist_free(nvl);
|
|
|
|
return (no_memory(zhp->zpool_hdl));
|
|
|
|
}
|
|
|
|
|
|
|
|
version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
|
2010-08-27 01:24:34 +04:00
|
|
|
zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) {
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_free(nvl);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
nvlist_free(nvl);
|
|
|
|
nvl = realprops;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Execute the corresponding ioctl() to set this property.
|
|
|
|
*/
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
|
|
|
|
if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) {
|
|
|
|
nvlist_free(nvl);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc);
|
|
|
|
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
nvlist_free(nvl);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
|
|
|
|
else
|
|
|
|
(void) zpool_props_refresh(zhp);
|
|
|
|
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2020-11-14 01:38:29 +03:00
|
|
|
zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp,
|
|
|
|
boolean_t literal)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
zprop_list_t *entry;
|
|
|
|
char buf[ZFS_MAXPROPLEN];
|
2012-12-14 03:24:15 +04:00
|
|
|
nvlist_t *features = NULL;
|
|
|
|
nvpair_t *nvp;
|
|
|
|
zprop_list_t **last;
|
|
|
|
boolean_t firstexpand = (NULL == *plp);
|
|
|
|
int i;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
|
|
|
|
return (-1);
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
last = plp;
|
|
|
|
while (*last != NULL)
|
|
|
|
last = &(*last)->pl_next;
|
|
|
|
|
|
|
|
if ((*plp)->pl_all)
|
|
|
|
features = zpool_get_features(zhp);
|
|
|
|
|
|
|
|
if ((*plp)->pl_all && firstexpand) {
|
|
|
|
for (i = 0; i < SPA_FEATURES; i++) {
|
|
|
|
zprop_list_t *entry = zfs_alloc(hdl,
|
|
|
|
sizeof (zprop_list_t));
|
|
|
|
entry->pl_prop = ZPROP_INVAL;
|
|
|
|
entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s",
|
|
|
|
spa_feature_table[i].fi_uname);
|
|
|
|
entry->pl_width = strlen(entry->pl_user_prop);
|
|
|
|
entry->pl_all = B_TRUE;
|
|
|
|
|
|
|
|
*last = entry;
|
|
|
|
last = &entry->pl_next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* add any unsupported features */
|
|
|
|
for (nvp = nvlist_next_nvpair(features, NULL);
|
|
|
|
nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
|
|
|
|
char *propname;
|
|
|
|
boolean_t found;
|
|
|
|
zprop_list_t *entry;
|
|
|
|
|
|
|
|
if (zfeature_is_supported(nvpair_name(nvp)))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
propname = zfs_asprintf(hdl, "unsupported@%s",
|
|
|
|
nvpair_name(nvp));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Before adding the property to the list make sure that no
|
|
|
|
* other pool already added the same property.
|
|
|
|
*/
|
|
|
|
found = B_FALSE;
|
|
|
|
entry = *plp;
|
|
|
|
while (entry != NULL) {
|
|
|
|
if (entry->pl_user_prop != NULL &&
|
|
|
|
strcmp(propname, entry->pl_user_prop) == 0) {
|
|
|
|
found = B_TRUE;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
entry = entry->pl_next;
|
|
|
|
}
|
|
|
|
if (found) {
|
|
|
|
free(propname);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
entry = zfs_alloc(hdl, sizeof (zprop_list_t));
|
|
|
|
entry->pl_prop = ZPROP_INVAL;
|
|
|
|
entry->pl_user_prop = propname;
|
|
|
|
entry->pl_width = strlen(entry->pl_user_prop);
|
|
|
|
entry->pl_all = B_TRUE;
|
|
|
|
|
|
|
|
*last = entry;
|
|
|
|
last = &entry->pl_next;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
for (entry = *plp; entry != NULL; entry = entry->pl_next) {
|
2020-11-14 01:38:29 +03:00
|
|
|
if (entry->pl_fixed && !literal)
|
2008-11-20 23:01:55 +03:00
|
|
|
continue;
|
|
|
|
|
|
|
|
if (entry->pl_prop != ZPROP_INVAL &&
|
|
|
|
zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf),
|
2020-11-14 01:38:29 +03:00
|
|
|
NULL, literal) == 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (strlen(buf) > entry->pl_width)
|
|
|
|
entry->pl_width = strlen(buf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
/*
|
|
|
|
* Get the state for the given feature on the given ZFS pool.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
|
|
|
|
size_t len)
|
|
|
|
{
|
|
|
|
uint64_t refcount;
|
|
|
|
boolean_t found = B_FALSE;
|
|
|
|
nvlist_t *features = zpool_get_features(zhp);
|
|
|
|
boolean_t supported;
|
|
|
|
const char *feature = strchr(propname, '@') + 1;
|
|
|
|
|
|
|
|
supported = zpool_prop_feature(propname);
|
|
|
|
ASSERT(supported || zpool_prop_unsupported(propname));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Convert from feature name to feature guid. This conversion is
|
2017-01-03 20:31:18 +03:00
|
|
|
* unnecessary for unsupported@... properties because they already
|
2012-12-14 03:24:15 +04:00
|
|
|
* use guids.
|
|
|
|
*/
|
|
|
|
if (supported) {
|
|
|
|
int ret;
|
2013-10-08 21:13:05 +04:00
|
|
|
spa_feature_t fid;
|
2012-12-14 03:24:15 +04:00
|
|
|
|
2013-10-08 21:13:05 +04:00
|
|
|
ret = zfeature_lookup_name(feature, &fid);
|
2012-12-14 03:24:15 +04:00
|
|
|
if (ret != 0) {
|
|
|
|
(void) strlcpy(buf, "-", len);
|
|
|
|
return (ENOTSUP);
|
|
|
|
}
|
2013-10-08 21:13:05 +04:00
|
|
|
feature = spa_feature_table[fid].fi_guid;
|
2012-12-14 03:24:15 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_lookup_uint64(features, feature, &refcount) == 0)
|
|
|
|
found = B_TRUE;
|
|
|
|
|
|
|
|
if (supported) {
|
|
|
|
if (!found) {
|
|
|
|
(void) strlcpy(buf, ZFS_FEATURE_DISABLED, len);
|
|
|
|
} else {
|
|
|
|
if (refcount == 0)
|
|
|
|
(void) strlcpy(buf, ZFS_FEATURE_ENABLED, len);
|
|
|
|
else
|
|
|
|
(void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (found) {
|
|
|
|
if (refcount == 0) {
|
|
|
|
(void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE);
|
|
|
|
} else {
|
|
|
|
(void) strcpy(buf, ZFS_UNSUPPORTED_READONLY);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
(void) strlcpy(buf, "-", len);
|
|
|
|
return (ENOTSUP);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Validate the given pool name, optionally putting an extended error message in
|
|
|
|
* 'buf'.
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
|
|
|
|
{
|
|
|
|
namecheck_err_t why;
|
|
|
|
char what;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = pool_namecheck(pool, &why, &what);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The rules for reserved pool names were extended at a later point.
|
|
|
|
* But we need to support users with existing pools that may now be
|
|
|
|
* invalid. So we only check for this expanded set of names during a
|
|
|
|
* create (or import), and only in userland.
|
|
|
|
*/
|
|
|
|
if (ret == 0 && !isopen &&
|
|
|
|
(strncmp(pool, "mirror", 6) == 0 ||
|
|
|
|
strncmp(pool, "raidz", 5) == 0 ||
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
strncmp(pool, "draid", 5) == 0 ||
|
2008-11-20 23:01:55 +03:00
|
|
|
strncmp(pool, "spare", 5) == 0 ||
|
|
|
|
strcmp(pool, "log") == 0)) {
|
|
|
|
if (hdl != NULL)
|
|
|
|
zfs_error_aux(hdl,
|
|
|
|
dgettext(TEXT_DOMAIN, "name is reserved"));
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (ret != 0) {
|
|
|
|
if (hdl != NULL) {
|
|
|
|
switch (why) {
|
|
|
|
case NAME_ERR_TOOLONG:
|
|
|
|
zfs_error_aux(hdl,
|
|
|
|
dgettext(TEXT_DOMAIN, "name is too long"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NAME_ERR_INVALCHAR:
|
|
|
|
zfs_error_aux(hdl,
|
|
|
|
dgettext(TEXT_DOMAIN, "invalid character "
|
|
|
|
"'%c' in pool name"), what);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NAME_ERR_NOLETTER:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"name must begin with a letter"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NAME_ERR_RESERVED:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"name is reserved"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NAME_ERR_DISKLIKE:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"pool name is reserved"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NAME_ERR_LEADING_SLASH:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"leading slash in name"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NAME_ERR_EMPTY_COMPONENT:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"empty component in name"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NAME_ERR_TRAILING_SLASH:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"trailing slash in name"));
|
|
|
|
break;
|
|
|
|
|
2017-01-27 01:42:15 +03:00
|
|
|
case NAME_ERR_MULTIPLE_DELIMITERS:
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
2017-01-27 01:42:15 +03:00
|
|
|
"multiple '@' and/or '#' delimiters in "
|
|
|
|
"name"));
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
2017-02-08 21:06:02 +03:00
|
|
|
|
2010-08-26 20:52:41 +04:00
|
|
|
case NAME_ERR_NO_AT:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"permission set is missing '@'"));
|
2017-02-08 21:06:02 +03:00
|
|
|
break;
|
2017-02-08 01:02:27 +03:00
|
|
|
|
|
|
|
default:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"(%d) not defined"), why);
|
2010-08-26 20:52:41 +04:00
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Open a handle to the given pool, even if the pool is currently in the FAULTED
|
|
|
|
* state.
|
|
|
|
*/
|
|
|
|
zpool_handle_t *
|
|
|
|
zpool_open_canfail(libzfs_handle_t *hdl, const char *pool)
|
|
|
|
{
|
|
|
|
zpool_handle_t *zhp;
|
|
|
|
boolean_t missing;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure the pool name is valid.
|
|
|
|
*/
|
|
|
|
if (!zpool_name_valid(hdl, B_TRUE, pool)) {
|
|
|
|
(void) zfs_error_fmt(hdl, EZFS_INVALIDNAME,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot open '%s'"),
|
|
|
|
pool);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
zhp->zpool_hdl = hdl;
|
|
|
|
(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
|
|
|
|
|
|
|
|
if (zpool_refresh_stats(zhp, &missing) != 0) {
|
|
|
|
zpool_close(zhp);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (missing) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool"));
|
|
|
|
(void) zfs_error_fmt(hdl, EZFS_NOENT,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool);
|
|
|
|
zpool_close(zhp);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (zhp);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Like the above, but silent on error. Used when iterating over pools (because
|
|
|
|
* the configuration cache may be out of date).
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret)
|
|
|
|
{
|
|
|
|
zpool_handle_t *zhp;
|
|
|
|
boolean_t missing;
|
|
|
|
|
|
|
|
if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
|
|
|
|
return (-1);
|
|
|
|
|
|
|
|
zhp->zpool_hdl = hdl;
|
|
|
|
(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
|
|
|
|
|
|
|
|
if (zpool_refresh_stats(zhp, &missing) != 0) {
|
|
|
|
zpool_close(zhp);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (missing) {
|
|
|
|
zpool_close(zhp);
|
|
|
|
*ret = NULL;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
*ret = zhp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Similar to zpool_open_canfail(), but refuses to open pools in the faulted
|
|
|
|
* state.
|
|
|
|
*/
|
|
|
|
zpool_handle_t *
|
|
|
|
zpool_open(libzfs_handle_t *hdl, const char *pool)
|
|
|
|
{
|
|
|
|
zpool_handle_t *zhp;
|
|
|
|
|
|
|
|
if ((zhp = zpool_open_canfail(hdl, pool)) == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
|
|
|
|
(void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name);
|
|
|
|
zpool_close(zhp);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (zhp);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Close the handle. Simply frees the memory associated with the handle.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
zpool_close(zpool_handle_t *zhp)
|
|
|
|
{
|
2016-04-01 06:54:07 +03:00
|
|
|
nvlist_free(zhp->zpool_config);
|
|
|
|
nvlist_free(zhp->zpool_old_config);
|
|
|
|
nvlist_free(zhp->zpool_props);
|
2008-11-20 23:01:55 +03:00
|
|
|
free(zhp);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the name of the pool.
|
|
|
|
*/
|
|
|
|
const char *
|
|
|
|
zpool_get_name(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
return (zhp->zpool_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the state of the pool (ACTIVE or UNAVAILABLE)
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_get_state(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
return (zhp->zpool_state);
|
|
|
|
}
|
|
|
|
|
2018-09-06 04:33:36 +03:00
|
|
|
/*
|
|
|
|
* Check if vdev list contains a special vdev
|
|
|
|
*/
|
|
|
|
static boolean_t
|
|
|
|
zpool_has_special_vdev(nvlist_t *nvroot)
|
|
|
|
{
|
|
|
|
nvlist_t **child;
|
|
|
|
uint_t children;
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child,
|
|
|
|
&children) == 0) {
|
|
|
|
for (uint_t c = 0; c < children; c++) {
|
|
|
|
char *bias;
|
|
|
|
|
|
|
|
if (nvlist_lookup_string(child[c],
|
|
|
|
ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 &&
|
|
|
|
strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
2021-01-22 20:47:06 +03:00
|
|
|
/*
|
|
|
|
* Check if vdev list contains a dRAID vdev
|
|
|
|
*/
|
|
|
|
static boolean_t
|
|
|
|
zpool_has_draid_vdev(nvlist_t *nvroot)
|
|
|
|
{
|
|
|
|
nvlist_t **child;
|
|
|
|
uint_t children;
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
|
|
|
|
&child, &children) == 0) {
|
|
|
|
for (uint_t c = 0; c < children; c++) {
|
|
|
|
char *type;
|
|
|
|
|
|
|
|
if (nvlist_lookup_string(child[c],
|
|
|
|
ZPOOL_CONFIG_TYPE, &type) == 0 &&
|
|
|
|
strcmp(type, VDEV_TYPE_DRAID) == 0) {
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
/*
|
|
|
|
* Output a dRAID top-level vdev name in to the provided buffer.
|
|
|
|
*/
|
|
|
|
static char *
|
|
|
|
zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity,
|
|
|
|
uint64_t spares, uint64_t children)
|
|
|
|
{
|
|
|
|
snprintf(name, len, "%s%llu:%llud:%lluc:%llus",
|
|
|
|
VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data,
|
|
|
|
(u_longlong_t)children, (u_longlong_t)spares);
|
|
|
|
|
|
|
|
return (name);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return B_TRUE if the provided name is a dRAID spare name.
|
|
|
|
*/
|
|
|
|
boolean_t
|
|
|
|
zpool_is_draid_spare(const char *name)
|
|
|
|
{
|
|
|
|
uint64_t spare_id, parity, vdev_id;
|
|
|
|
|
|
|
|
if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu",
|
|
|
|
(u_longlong_t *)&parity, (u_longlong_t *)&vdev_id,
|
|
|
|
(u_longlong_t *)&spare_id) == 3) {
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Create the named pool, using the provided vdev list. It is assumed
|
|
|
|
* that the consumer has already validated the contents of the nvlist, so we
|
|
|
|
* don't have to worry about error semantics.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
|
2008-12-03 23:09:06 +03:00
|
|
|
nvlist_t *props, nvlist_t *fsprops)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-12-03 23:09:06 +03:00
|
|
|
nvlist_t *zc_fsprops = NULL;
|
|
|
|
nvlist_t *zc_props = NULL;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
nvlist_t *hidden_args = NULL;
|
|
|
|
uint8_t *wkeydata = NULL;
|
|
|
|
uint_t wkeylen = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
2008-12-03 23:09:06 +03:00
|
|
|
int ret = -1;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot create '%s'"), pool);
|
|
|
|
|
|
|
|
if (!zpool_name_valid(hdl, B_FALSE, pool))
|
|
|
|
return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
|
|
|
|
|
|
|
|
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
|
|
|
|
return (-1);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (props) {
|
2010-08-27 01:24:34 +04:00
|
|
|
prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((zc_props = zpool_valid_proplist(hdl, pool, props,
|
2010-08-27 01:24:34 +04:00
|
|
|
SPA_VERSION_1, flags, msg)) == NULL) {
|
2008-12-03 23:09:06 +03:00
|
|
|
goto create_failed;
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (fsprops) {
|
|
|
|
uint64_t zoned;
|
|
|
|
char *zonestr;
|
|
|
|
|
|
|
|
zoned = ((nvlist_lookup_string(fsprops,
|
|
|
|
zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) &&
|
|
|
|
strcmp(zonestr, "on") == 0);
|
|
|
|
|
2016-01-14 02:05:59 +03:00
|
|
|
if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
fsprops, zoned, NULL, NULL, B_TRUE, msg)) == NULL) {
|
2008-12-03 23:09:06 +03:00
|
|
|
goto create_failed;
|
|
|
|
}
|
2018-09-06 04:33:36 +03:00
|
|
|
|
|
|
|
if (nvlist_exists(zc_fsprops,
|
|
|
|
zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) &&
|
|
|
|
!zpool_has_special_vdev(nvroot)) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"%s property requires a special vdev"),
|
|
|
|
zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADPROP, msg);
|
|
|
|
goto create_failed;
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (!zc_props &&
|
|
|
|
(nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
|
|
|
|
goto create_failed;
|
|
|
|
}
|
2017-10-13 20:09:04 +03:00
|
|
|
if (zfs_crypto_create(hdl, NULL, zc_fsprops, props, B_TRUE,
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
&wkeydata, &wkeylen) != 0) {
|
|
|
|
zfs_error(hdl, EZFS_CRYPTOFAILED, msg);
|
|
|
|
goto create_failed;
|
|
|
|
}
|
2008-12-03 23:09:06 +03:00
|
|
|
if (nvlist_add_nvlist(zc_props,
|
|
|
|
ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) {
|
|
|
|
goto create_failed;
|
|
|
|
}
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
if (wkeydata != NULL) {
|
|
|
|
if (nvlist_alloc(&hidden_args, NV_UNIQUE_NAME, 0) != 0)
|
|
|
|
goto create_failed;
|
|
|
|
|
|
|
|
if (nvlist_add_uint8_array(hidden_args, "wkeydata",
|
|
|
|
wkeydata, wkeylen) != 0)
|
|
|
|
goto create_failed;
|
|
|
|
|
|
|
|
if (nvlist_add_nvlist(zc_props, ZPOOL_HIDDEN_ARGS,
|
|
|
|
hidden_args) != 0)
|
|
|
|
goto create_failed;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
|
|
|
|
goto create_failed;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zcmd_free_nvlists(&zc);
|
2008-12-03 23:09:06 +03:00
|
|
|
nvlist_free(zc_props);
|
|
|
|
nvlist_free(zc_fsprops);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
nvlist_free(hidden_args);
|
|
|
|
if (wkeydata != NULL)
|
|
|
|
free(wkeydata);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
switch (errno) {
|
|
|
|
case EBUSY:
|
|
|
|
/*
|
|
|
|
* This can happen if the user has specified the same
|
|
|
|
* device multiple times. We can't reliably detect this
|
|
|
|
* until we try to add it and see we already have a
|
2010-08-26 22:56:53 +04:00
|
|
|
* label. This can also happen under if the device is
|
|
|
|
* part of an active md or lvm device.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
2013-11-01 23:26:11 +04:00
|
|
|
"one or more vdevs refer to the same device, or "
|
|
|
|
"one of\nthe devices is part of an active md or "
|
|
|
|
"lvm device"));
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_BADDEV, msg));
|
|
|
|
|
2016-01-14 02:05:59 +03:00
|
|
|
case ERANGE:
|
|
|
|
/*
|
|
|
|
* This happens if the record size is smaller or larger
|
|
|
|
* than the allowed size range, or not a power of 2.
|
|
|
|
*
|
|
|
|
* NOTE: although zfs_valid_proplist is called earlier,
|
|
|
|
* this case may have slipped through since the
|
|
|
|
* pool does not exist yet and it is therefore
|
|
|
|
* impossible to read properties e.g. max blocksize
|
|
|
|
* from the pool.
|
|
|
|
*/
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"record size invalid"));
|
|
|
|
return (zfs_error(hdl, EZFS_BADPROP, msg));
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
case EOVERFLOW:
|
|
|
|
/*
|
|
|
|
* This occurs when one of the devices is below
|
|
|
|
* SPA_MINDEVSIZE. Unfortunately, we can't detect which
|
|
|
|
* device was the problem device since there's no
|
|
|
|
* reliable way to determine device size from userland.
|
|
|
|
*/
|
|
|
|
{
|
|
|
|
char buf[64];
|
|
|
|
|
2017-05-02 23:43:53 +03:00
|
|
|
zfs_nicebytes(SPA_MINDEVSIZE, buf,
|
|
|
|
sizeof (buf));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"one or more devices is less than the "
|
|
|
|
"minimum size (%s)"), buf);
|
|
|
|
}
|
|
|
|
return (zfs_error(hdl, EZFS_BADDEV, msg));
|
|
|
|
|
|
|
|
case ENOSPC:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"one or more devices is out of space"));
|
|
|
|
return (zfs_error(hdl, EZFS_BADDEV, msg));
|
|
|
|
|
2021-01-22 20:47:06 +03:00
|
|
|
case EINVAL:
|
|
|
|
if (zpool_has_draid_vdev(nvroot) &&
|
|
|
|
zfeature_lookup_name("draid", NULL) != 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"dRAID vdevs are unsupported by the "
|
|
|
|
"kernel"));
|
|
|
|
return (zfs_error(hdl, EZFS_BADDEV, msg));
|
|
|
|
} else {
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
default:
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
create_failed:
|
2008-11-20 23:01:55 +03:00
|
|
|
zcmd_free_nvlists(&zc);
|
2008-12-03 23:09:06 +03:00
|
|
|
nvlist_free(zc_props);
|
|
|
|
nvlist_free(zc_fsprops);
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decyrpt, and authenticate protected datasets.
Each object set maintains a Merkel tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 20:36:48 +03:00
|
|
|
nvlist_free(hidden_args);
|
|
|
|
if (wkeydata != NULL)
|
|
|
|
free(wkeydata);
|
2008-12-03 23:09:06 +03:00
|
|
|
return (ret);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Destroy the given pool. It is up to the caller to ensure that there are no
|
|
|
|
* datasets left in the pool.
|
|
|
|
*/
|
|
|
|
int
|
2013-08-28 15:45:09 +04:00
|
|
|
zpool_destroy(zpool_handle_t *zhp, const char *log_str)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_handle_t *zfp = NULL;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
char msg[1024];
|
|
|
|
|
|
|
|
if (zhp->zpool_state == POOL_STATE_ACTIVE &&
|
2010-08-27 01:24:34 +04:00
|
|
|
(zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (-1);
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2013-08-28 15:45:09 +04:00
|
|
|
zc.zc_history = (uint64_t)(uintptr_t)log_str;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot destroy '%s'"), zhp->zpool_name);
|
|
|
|
|
|
|
|
if (errno == EROFS) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"one or more devices is read only"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, msg);
|
|
|
|
} else {
|
|
|
|
(void) zpool_standard_error(hdl, errno, msg);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zfp)
|
|
|
|
zfs_close(zfp);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zfp) {
|
|
|
|
remove_mountpoint(zfp);
|
|
|
|
zfs_close(zfp);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2016-12-17 01:11:29 +03:00
|
|
|
/*
|
|
|
|
* Create a checkpoint in the given pool.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_checkpoint(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
char msg[1024];
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = lzc_pool_checkpoint(zhp->zpool_name);
|
|
|
|
if (error != 0) {
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot checkpoint '%s'"), zhp->zpool_name);
|
|
|
|
(void) zpool_standard_error(hdl, error, msg);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Discard the checkpoint from the given pool.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_discard_checkpoint(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
char msg[1024];
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = lzc_pool_checkpoint_discard(zhp->zpool_name);
|
|
|
|
if (error != 0) {
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot discard checkpoint in '%s'"), zhp->zpool_name);
|
|
|
|
(void) zpool_standard_error(hdl, error, msg);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Add the given vdevs to the pool. The caller must have already performed the
|
|
|
|
* necessary verification to ensure that the vdev specification is well-formed.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
int ret;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
char msg[1024];
|
|
|
|
nvlist_t **spares, **l2cache;
|
|
|
|
uint_t nspares, nl2cache;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot add to '%s'"), zhp->zpool_name);
|
|
|
|
|
|
|
|
if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
|
|
|
|
SPA_VERSION_SPARES &&
|
|
|
|
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
|
|
|
|
&spares, &nspares) == 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
|
|
|
|
"upgraded to add hot spares"));
|
|
|
|
return (zfs_error(hdl, EZFS_BADVERSION, msg));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
|
|
|
|
SPA_VERSION_L2CACHE &&
|
|
|
|
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
|
|
|
|
&l2cache, &nl2cache) == 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
|
|
|
|
"upgraded to add cache devices"));
|
|
|
|
return (zfs_error(hdl, EZFS_BADVERSION, msg));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
|
|
|
|
return (-1);
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
switch (errno) {
|
|
|
|
case EBUSY:
|
|
|
|
/*
|
|
|
|
* This can happen if the user has specified the same
|
|
|
|
* device multiple times. We can't reliably detect this
|
|
|
|
* until we try to add it and see we already have a
|
|
|
|
* label.
|
|
|
|
*/
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"one or more vdevs refer to the same device"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, msg);
|
|
|
|
break;
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
case EINVAL:
|
2021-01-22 20:47:06 +03:00
|
|
|
|
|
|
|
if (zpool_has_draid_vdev(nvroot) &&
|
|
|
|
zfeature_lookup_name("draid", NULL) != 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"dRAID vdevs are unsupported by the "
|
|
|
|
"kernel"));
|
|
|
|
} else {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"invalid config; a pool with removing/"
|
|
|
|
"removed vdevs does not support adding "
|
|
|
|
"raidz or dRAID vdevs"));
|
|
|
|
}
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, msg);
|
|
|
|
break;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
case EOVERFLOW:
|
|
|
|
/*
|
2019-09-03 03:53:27 +03:00
|
|
|
* This occurs when one of the devices is below
|
2008-11-20 23:01:55 +03:00
|
|
|
* SPA_MINDEVSIZE. Unfortunately, we can't detect which
|
|
|
|
* device was the problem device since there's no
|
|
|
|
* reliable way to determine device size from userland.
|
|
|
|
*/
|
|
|
|
{
|
|
|
|
char buf[64];
|
|
|
|
|
2017-05-02 23:43:53 +03:00
|
|
|
zfs_nicebytes(SPA_MINDEVSIZE, buf,
|
|
|
|
sizeof (buf));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"device is less than the minimum "
|
|
|
|
"size (%s)"), buf);
|
|
|
|
}
|
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ENOTSUP:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"pool must be upgraded to add these vdevs"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADVERSION, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
(void) zpool_standard_error(hdl, errno, msg);
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = -1;
|
|
|
|
} else {
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Exports the pool from the system. The caller must ensure that there are no
|
|
|
|
* mounted datasets in the pool.
|
|
|
|
*/
|
2013-08-28 15:45:09 +04:00
|
|
|
static int
|
|
|
|
zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
|
|
|
|
const char *log_str)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
zc.zc_cookie = force;
|
2009-01-16 00:59:39 +03:00
|
|
|
zc.zc_guid = hardforce;
|
2013-08-28 15:45:09 +04:00
|
|
|
zc.zc_history = (uint64_t)(uintptr_t)log_str;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
|
|
|
|
switch (errno) {
|
|
|
|
case EXDEV:
|
|
|
|
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"use '-f' to override the following errors:\n"
|
|
|
|
"'%s' has an active shared spare which could be"
|
|
|
|
" used by other pools once '%s' is exported."),
|
|
|
|
zhp->zpool_name, zhp->zpool_name);
|
2021-05-15 13:23:45 +03:00
|
|
|
return (zfs_error_fmt(zhp->zpool_hdl, EZFS_ACTIVE_SPARE,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot export '%s'"),
|
|
|
|
zhp->zpool_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
default:
|
|
|
|
return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
|
2021-05-15 13:23:45 +03:00
|
|
|
dgettext(TEXT_DOMAIN, "cannot export '%s'"),
|
|
|
|
zhp->zpool_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
int
|
2013-08-28 15:45:09 +04:00
|
|
|
zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
|
2009-01-16 00:59:39 +03:00
|
|
|
{
|
2013-08-28 15:45:09 +04:00
|
|
|
return (zpool_export_common(zhp, force, B_FALSE, log_str));
|
2009-01-16 00:59:39 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2013-08-28 15:45:09 +04:00
|
|
|
zpool_export_force(zpool_handle_t *zhp, const char *log_str)
|
2009-01-16 00:59:39 +03:00
|
|
|
{
|
2013-08-28 15:45:09 +04:00
|
|
|
return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str));
|
2009-01-16 00:59:39 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
static void
|
|
|
|
zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_t *config)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_t *nv = NULL;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t rewindto;
|
|
|
|
int64_t loss = -1;
|
|
|
|
struct tm t;
|
|
|
|
char timestr[128];
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (!hdl->libzfs_printerr || config == NULL)
|
|
|
|
return;
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
|
|
|
|
nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
return;
|
2012-12-14 03:24:15 +04:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
return;
|
2010-08-27 01:24:34 +04:00
|
|
|
(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (localtime_r((time_t *)&rewindto, &t) != NULL &&
|
2010-08-26 20:52:39 +04:00
|
|
|
strftime(timestr, 128, "%c", &t) != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dryrun) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"Would be able to return %s "
|
|
|
|
"to its state as of %s.\n"),
|
|
|
|
name, timestr);
|
|
|
|
} else {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"Pool %s returned to its state as of %s.\n"),
|
|
|
|
name, timestr);
|
|
|
|
}
|
|
|
|
if (loss > 120) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"%s approximately %lld "),
|
|
|
|
dryrun ? "Would discard" : "Discarded",
|
2010-08-26 20:52:39 +04:00
|
|
|
((longlong_t)loss + 30) / 60);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"minutes of transactions.\n"));
|
|
|
|
} else if (loss > 0) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"%s approximately %lld "),
|
2010-08-26 20:52:39 +04:00
|
|
|
dryrun ? "Would discard" : "Discarded",
|
|
|
|
(longlong_t)loss);
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"seconds of transactions.\n"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
|
|
|
|
nvlist_t *config)
|
|
|
|
{
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_t *nv = NULL;
|
2010-05-29 00:45:14 +04:00
|
|
|
int64_t loss = -1;
|
|
|
|
uint64_t edata = UINT64_MAX;
|
|
|
|
uint64_t rewindto;
|
|
|
|
struct tm t;
|
|
|
|
char timestr[128];
|
|
|
|
|
|
|
|
if (!hdl->libzfs_printerr)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (reason >= 0)
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN, "action: "));
|
|
|
|
else
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN, "\t"));
|
|
|
|
|
|
|
|
/* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
|
2010-08-27 01:24:34 +04:00
|
|
|
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
|
2012-12-14 03:24:15 +04:00
|
|
|
nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 ||
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
goto no_info;
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
|
|
|
|
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
|
2010-05-29 00:45:14 +04:00
|
|
|
&edata);
|
|
|
|
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"Recovery is possible, but will result in some data loss.\n"));
|
|
|
|
|
|
|
|
if (localtime_r((time_t *)&rewindto, &t) != NULL &&
|
2010-08-26 20:52:39 +04:00
|
|
|
strftime(timestr, 128, "%c", &t) != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"\tReturning the pool to its state as of %s\n"
|
|
|
|
"\tshould correct the problem. "),
|
|
|
|
timestr);
|
|
|
|
} else {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"\tReverting the pool to an earlier state "
|
|
|
|
"should correct the problem.\n\t"));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (loss > 120) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"Approximately %lld minutes of data\n"
|
2010-08-26 20:52:39 +04:00
|
|
|
"\tmust be discarded, irreversibly. "),
|
|
|
|
((longlong_t)loss + 30) / 60);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else if (loss > 0) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"Approximately %lld seconds of data\n"
|
2010-08-26 20:52:39 +04:00
|
|
|
"\tmust be discarded, irreversibly. "),
|
|
|
|
(longlong_t)loss);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
if (edata != 0 && edata != UINT64_MAX) {
|
|
|
|
if (edata == 1) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"After rewind, at least\n"
|
|
|
|
"\tone persistent user-data error will remain. "));
|
|
|
|
} else {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"After rewind, several\n"
|
|
|
|
"\tpersistent user-data errors will remain. "));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "),
|
|
|
|
reason >= 0 ? "clear" : "import", name);
|
|
|
|
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"A scrub of the pool\n"
|
|
|
|
"\tis strongly recommended after recovery.\n"));
|
|
|
|
return;
|
|
|
|
|
|
|
|
no_info:
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"Destroy and re-create the pool from\n\ta backup source.\n"));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* zpool_import() is a contracted interface. Should be kept the same
|
|
|
|
* if possible.
|
|
|
|
*
|
|
|
|
* Applications should use zpool_import_props() to import a pool with
|
|
|
|
* new properties value to be set.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
|
|
|
|
char *altroot)
|
|
|
|
{
|
|
|
|
nvlist_t *props = NULL;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (altroot != NULL) {
|
|
|
|
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
|
|
|
|
return (zfs_error_fmt(hdl, EZFS_NOMEM,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
|
|
|
|
newname));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_add_string(props,
|
2009-01-16 00:59:39 +03:00
|
|
|
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
|
|
|
|
nvlist_add_string(props,
|
|
|
|
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_free(props);
|
|
|
|
return (zfs_error_fmt(hdl, EZFS_NOMEM,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
|
|
|
|
newname));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
ret = zpool_import_props(hdl, config, newname, props,
|
|
|
|
ZFS_IMPORT_NORMAL);
|
2016-04-01 06:54:07 +03:00
|
|
|
nvlist_free(props);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
static void
|
|
|
|
print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
|
|
|
|
int indent)
|
|
|
|
{
|
|
|
|
nvlist_t **child;
|
|
|
|
uint_t c, children;
|
|
|
|
char *vname;
|
|
|
|
uint64_t is_log = 0;
|
|
|
|
|
|
|
|
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG,
|
|
|
|
&is_log);
|
|
|
|
|
|
|
|
if (name != NULL)
|
|
|
|
(void) printf("\t%*s%s%s\n", indent, "", name,
|
|
|
|
is_log ? " [log]" : "");
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
|
|
|
|
&child, &children) != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (c = 0; c < children; c++) {
|
2013-12-29 22:40:46 +04:00
|
|
|
vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID);
|
2010-08-27 01:24:34 +04:00
|
|
|
print_vdev_tree(hdl, vname, child[c], indent + 2);
|
|
|
|
free(vname);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-12-14 03:24:15 +04:00
|
|
|
void
|
|
|
|
zpool_print_unsup_feat(nvlist_t *config)
|
|
|
|
{
|
|
|
|
nvlist_t *nvinfo, *unsup_feat;
|
|
|
|
nvpair_t *nvp;
|
|
|
|
|
|
|
|
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) ==
|
|
|
|
0);
|
|
|
|
verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT,
|
|
|
|
&unsup_feat) == 0);
|
|
|
|
|
|
|
|
for (nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL;
|
|
|
|
nvp = nvlist_next_nvpair(unsup_feat, nvp)) {
|
|
|
|
char *desc;
|
|
|
|
|
|
|
|
verify(nvpair_type(nvp) == DATA_TYPE_STRING);
|
|
|
|
verify(nvpair_value_string(nvp, &desc) == 0);
|
|
|
|
|
|
|
|
if (strlen(desc) > 0)
|
|
|
|
(void) printf("\t%s (%s)\n", nvpair_name(nvp), desc);
|
|
|
|
else
|
|
|
|
(void) printf("\t%s\n", nvpair_name(nvp));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Import the given pool using the known configuration and a list of
|
|
|
|
* properties to be set. The configuration should have come from
|
|
|
|
* zpool_find_import(). The 'newname' parameters control whether the pool
|
|
|
|
* is imported with a different name.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_t *props, int flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2017-02-11 01:51:09 +03:00
|
|
|
zpool_load_policy_t policy;
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_t *nv = NULL;
|
|
|
|
nvlist_t *nvinfo = NULL;
|
|
|
|
nvlist_t *missing = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
char *thename;
|
|
|
|
char *origname;
|
|
|
|
int ret;
|
2010-08-27 01:24:34 +04:00
|
|
|
int error = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
char errbuf[1024];
|
|
|
|
|
|
|
|
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
|
|
|
|
&origname) == 0);
|
|
|
|
|
|
|
|
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot import pool '%s'"), origname);
|
|
|
|
|
|
|
|
if (newname != NULL) {
|
|
|
|
if (!zpool_name_valid(hdl, B_FALSE, newname))
|
|
|
|
return (zfs_error_fmt(hdl, EZFS_INVALIDNAME,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
|
|
|
|
newname));
|
|
|
|
thename = (char *)newname;
|
|
|
|
} else {
|
|
|
|
thename = origname;
|
|
|
|
}
|
|
|
|
|
2016-01-21 03:31:44 +03:00
|
|
|
if (props != NULL) {
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t version;
|
2010-08-27 01:24:34 +04:00
|
|
|
prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
|
|
|
|
&version) == 0);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((props = zpool_valid_proplist(hdl, origname,
|
2016-01-21 03:31:44 +03:00
|
|
|
props, version, flags, errbuf)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (-1);
|
2016-01-21 03:31:44 +03:00
|
|
|
if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_free(props);
|
|
|
|
return (-1);
|
|
|
|
}
|
2016-01-21 03:31:44 +03:00
|
|
|
nvlist_free(props);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));
|
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
|
|
|
|
&zc.zc_guid) == 0);
|
|
|
|
|
|
|
|
if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) {
|
2016-01-21 03:31:44 +03:00
|
|
|
zcmd_free_nvlists(&zc);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (-1);
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
|
2016-01-21 03:31:44 +03:00
|
|
|
zcmd_free_nvlists(&zc);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (-1);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
zc.zc_cookie = flags;
|
|
|
|
while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
|
|
|
|
errno == ENOMEM) {
|
|
|
|
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ret != 0)
|
|
|
|
error = errno;
|
|
|
|
|
|
|
|
(void) zcmd_read_dst_nvlist(hdl, &zc, &nv);
|
2016-01-21 03:31:44 +03:00
|
|
|
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
|
2017-02-11 01:51:09 +03:00
|
|
|
zpool_get_load_policy(config, &policy);
|
2010-08-27 01:24:34 +04:00
|
|
|
|
|
|
|
if (error) {
|
2008-11-20 23:01:55 +03:00
|
|
|
char desc[1024];
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
char aux[256];
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Dry-run failed, but we print out what success
|
|
|
|
* looks like if we found a best txg
|
|
|
|
*/
|
2017-02-11 01:51:09 +03:00
|
|
|
if (policy.zlp_rewind & ZPOOL_TRY_REWIND) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zpool_rewind_exclaim(hdl, newname ? origname : thename,
|
2010-08-27 01:24:34 +04:00
|
|
|
B_TRUE, nv);
|
|
|
|
nvlist_free(nv);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (newname == NULL)
|
|
|
|
(void) snprintf(desc, sizeof (desc),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
|
|
|
|
thename);
|
|
|
|
else
|
|
|
|
(void) snprintf(desc, sizeof (desc),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
|
|
|
|
origname, thename);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
switch (error) {
|
2008-11-20 23:01:55 +03:00
|
|
|
case ENOTSUP:
|
2012-12-14 03:24:15 +04:00
|
|
|
if (nv != NULL && nvlist_lookup_nvlist(nv,
|
|
|
|
ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
|
|
|
|
nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN, "This "
|
|
|
|
"pool uses the following feature(s) not "
|
|
|
|
"supported by this system:\n"));
|
|
|
|
zpool_print_unsup_feat(nv);
|
|
|
|
if (nvlist_exists(nvinfo,
|
|
|
|
ZPOOL_CONFIG_CAN_RDONLY)) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
|
|
|
"All unsupported features are only "
|
|
|
|
"required for writing to the pool."
|
|
|
|
"\nThe pool can be imported using "
|
|
|
|
"'-o readonly=on'.\n"));
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Unsupported version.
|
|
|
|
*/
|
|
|
|
(void) zfs_error(hdl, EZFS_BADVERSION, desc);
|
|
|
|
break;
|
|
|
|
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
case EREMOTEIO:
|
|
|
|
if (nv != NULL && nvlist_lookup_nvlist(nv,
|
|
|
|
ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) {
|
|
|
|
char *hostname = "<unknown>";
|
|
|
|
uint64_t hostid = 0;
|
|
|
|
mmp_state_t mmp_state;
|
|
|
|
|
|
|
|
mmp_state = fnvlist_lookup_uint64(nvinfo,
|
|
|
|
ZPOOL_CONFIG_MMP_STATE);
|
|
|
|
|
|
|
|
if (nvlist_exists(nvinfo,
|
|
|
|
ZPOOL_CONFIG_MMP_HOSTNAME))
|
|
|
|
hostname = fnvlist_lookup_string(nvinfo,
|
|
|
|
ZPOOL_CONFIG_MMP_HOSTNAME);
|
|
|
|
|
|
|
|
if (nvlist_exists(nvinfo,
|
|
|
|
ZPOOL_CONFIG_MMP_HOSTID))
|
|
|
|
hostid = fnvlist_lookup_uint64(nvinfo,
|
|
|
|
ZPOOL_CONFIG_MMP_HOSTID);
|
|
|
|
|
|
|
|
if (mmp_state == MMP_STATE_ACTIVE) {
|
|
|
|
(void) snprintf(aux, sizeof (aux),
|
|
|
|
dgettext(TEXT_DOMAIN, "pool is imp"
|
|
|
|
"orted on host '%s' (hostid=%lx).\n"
|
|
|
|
"Export the pool on the other "
|
|
|
|
"system, then run 'zpool import'."),
|
|
|
|
hostname, (unsigned long) hostid);
|
|
|
|
} else if (mmp_state == MMP_STATE_NO_HOSTID) {
|
|
|
|
(void) snprintf(aux, sizeof (aux),
|
|
|
|
dgettext(TEXT_DOMAIN, "pool has "
|
|
|
|
"the multihost property on and "
|
|
|
|
"the\nsystem's hostid is not set. "
|
|
|
|
"Set a unique system hostid with "
|
2017-07-19 04:11:08 +03:00
|
|
|
"the zgenhostid(8) command.\n"));
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
}
|
|
|
|
|
2021-05-15 13:23:45 +03:00
|
|
|
(void) zfs_error_aux(hdl, "%s", aux);
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
}
|
|
|
|
(void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc);
|
|
|
|
break;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
case EINVAL:
|
|
|
|
(void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
|
|
|
|
break;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
case EROFS:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"one or more devices is read only"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, desc);
|
|
|
|
break;
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
case ENXIO:
|
|
|
|
if (nv && nvlist_lookup_nvlist(nv,
|
|
|
|
ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
|
|
|
|
nvlist_lookup_nvlist(nvinfo,
|
|
|
|
ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
|
|
|
|
(void) printf(dgettext(TEXT_DOMAIN,
|
OpenZFS 9075 - Improve ZFS pool import/load process and corrupted pool recovery
Some work has been done lately to improve the debugability of the ZFS pool
load (and import) process. This includes:
7638 Refactor spa_load_impl into several functions
8961 SPA load/import should tell us why it failed
7277 zdb should be able to print zfs_dbgmsg's
To iterate on top of that, there's a few changes that were made to make the
import process more resilient and crash free. One of the first tasks during the
pool load process is to parse a config provided from userland that describes
what devices the pool is composed of. A vdev tree is generated from that config,
and then all the vdevs are opened.
The Meta Object Set (MOS) of the pool is accessed, and several metadata objects
that are necessary to load the pool are read. The exact configuration of the
pool is also stored inside the MOS. Since the configuration provided from
userland is external and might not accurately describe the vdev tree
of the pool at the txg that is being loaded, it cannot be relied upon to safely
operate the pool. For that reason, the configuration in the MOS is read early
on. In the past, the two configurations were compared together and if there was
a mismatch then the load process was aborted and an error was returned.
The latter was a good way to ensure a pool does not get corrupted, however it
made the pool load process needlessly fragile in cases where the vdev
configuration changed or the userland configuration was outdated. Since the MOS
is stored in 3 copies, the configuration provided by userland doesn't have to be
perfect in order to read its contents. Hence, a new approach has been adopted:
The pool is first opened with the untrusted userland configuration just so that
the real configuration can be read from the MOS. The trusted MOS configuration
is then used to generate a new vdev tree and the pool is re-opened.
When the pool is opened with an untrusted configuration, writes are disabled
to avoid accidentally damaging it. During reads, some sanity checks are
performed on block pointers to see if each DVA points to a known vdev;
when the configuration is untrusted, instead of panicking the system if those
checks fail we simply avoid issuing reads to the invalid DVAs.
This new two-step pool load process now allows rewinding pools across
vdev tree changes such as device replacement, addition, etc. Loading a pool
from an external config file in a clustering environment also becomes much
safer now since the pool will import even if the config is outdated and didn't,
for instance, register a recent device addition.
With this code in place, it became relatively easy to implement a
long-sought-after feature: the ability to import a pool with missing top level
(i.e. non-redundant) devices. Note that since this almost guarantees some loss
of data, this feature is for now restricted to a read-only import.
Porting notes (ZTS):
* Fix 'make dist' target in zpool_import
* The maximum path length allowed by tar is 99 characters. Several
of the new test cases exceeded this limit resulting in them not
being included in the tarball. Shorten the names slightly.
* Set/get tunables using accessor functions.
* Get last synced txg via the "zfs_txg_history" mechanism.
* Clear zinject handlers in cleanup for import_cache_device_replaced
and import_rewind_device_replaced in order that the zpool can be
exported if there is an error.
* Increase FILESIZE to 8G in zfs-test.sh to allow for a larger
ext4 file system to be created on ZFS_DISK2. Also, there's
no need to partition ZFS_DISK2 at all. The partitioning had
already been disabled for multipath devices. Among other things,
the partitioning steals some space from the ext4 file system,
makes it difficult to accurately calculate the parameters to
parted and can make some of the tests fail.
* Increase FS_SIZE and FILE_SIZE in the zpool_import test
configuration now that FILESIZE is larger.
* Write more data in order that device evacuation takes longer in
a couple tests.
* Use mkdir -p to avoid errors when the directory already exists.
* Remove use of sudo in import_rewind_config_changed.
Authored by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Andrew Stormont <andyjstormont@gmail.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9075
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/619c0123
Closes #7459
2016-07-22 17:39:36 +03:00
|
|
|
"The devices below are missing or "
|
|
|
|
"corrupted, use '-m' to import the pool "
|
|
|
|
"anyway:\n"));
|
2010-08-27 01:24:34 +04:00
|
|
|
print_vdev_tree(hdl, NULL, missing, 2);
|
|
|
|
(void) printf("\n");
|
|
|
|
}
|
|
|
|
(void) zpool_standard_error(hdl, error, desc);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case EEXIST:
|
|
|
|
(void) zpool_standard_error(hdl, error, desc);
|
|
|
|
break;
|
|
|
|
|
2012-05-31 23:42:51 +04:00
|
|
|
case EBUSY:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"one or more devices are already in use\n"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, desc);
|
|
|
|
break;
|
2016-06-16 00:51:27 +03:00
|
|
|
case ENAMETOOLONG:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"new name of at least one dataset is longer than "
|
|
|
|
"the maximum allowable length"));
|
|
|
|
(void) zfs_error(hdl, EZFS_NAMETOOLONG, desc);
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
default:
|
2010-08-27 01:24:34 +04:00
|
|
|
(void) zpool_standard_error(hdl, error, desc);
|
2010-05-29 00:45:14 +04:00
|
|
|
zpool_explain_recover(hdl,
|
2010-08-27 01:24:34 +04:00
|
|
|
newname ? origname : thename, -error, nv);
|
2010-05-29 00:45:14 +04:00
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_free(nv);
|
2008-11-20 23:01:55 +03:00
|
|
|
ret = -1;
|
|
|
|
} else {
|
|
|
|
zpool_handle_t *zhp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This should never fail, but play it safe anyway.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zpool_open_silent(hdl, thename, &zhp) != 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
ret = -1;
|
2010-05-29 00:45:14 +04:00
|
|
|
else if (zhp != NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
zpool_close(zhp);
|
2017-02-11 01:51:09 +03:00
|
|
|
if (policy.zlp_rewind &
|
2010-05-29 00:45:14 +04:00
|
|
|
(ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
|
|
|
|
zpool_rewind_exclaim(hdl, newname ? origname : thename,
|
2017-02-11 01:51:09 +03:00
|
|
|
((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
nvlist_free(nv);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (0);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
|
|
|
|
* Translate vdev names to guids. If a vdev_path is determined to be
|
|
|
|
* unsuitable then a vd_errlist is allocated and the vdev path and errno
|
|
|
|
* are added to it.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds,
|
|
|
|
nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist)
|
|
|
|
{
|
|
|
|
nvlist_t *errlist = NULL;
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL;
|
|
|
|
elem = nvlist_next_nvpair(vds, elem)) {
|
|
|
|
boolean_t spare, cache;
|
|
|
|
|
|
|
|
char *vd_path = nvpair_name(elem);
|
|
|
|
nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache,
|
|
|
|
NULL);
|
|
|
|
|
|
|
|
if ((tgt == NULL) || cache || spare) {
|
|
|
|
if (errlist == NULL) {
|
|
|
|
errlist = fnvlist_alloc();
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t err = (tgt == NULL) ? EZFS_NODEVICE :
|
|
|
|
(spare ? EZFS_ISSPARE : EZFS_ISL2CACHE);
|
|
|
|
fnvlist_add_int64(errlist, vd_path, err);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
|
|
|
|
fnvlist_add_uint64(vdev_guids, vd_path, guid);
|
|
|
|
|
|
|
|
char msg[MAXNAMELEN];
|
|
|
|
(void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid);
|
|
|
|
fnvlist_add_string(guids_to_paths, msg, vd_path);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (error != 0) {
|
|
|
|
verify(errlist != NULL);
|
|
|
|
if (vd_errlist != NULL)
|
|
|
|
*vd_errlist = errlist;
|
|
|
|
else
|
|
|
|
fnvlist_free(errlist);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
static int
|
|
|
|
xlate_init_err(int err)
|
|
|
|
{
|
|
|
|
switch (err) {
|
|
|
|
case ENODEV:
|
|
|
|
return (EZFS_NODEVICE);
|
|
|
|
case EINVAL:
|
|
|
|
case EROFS:
|
|
|
|
return (EZFS_BADDEV);
|
|
|
|
case EBUSY:
|
|
|
|
return (EZFS_INITIALIZING);
|
|
|
|
case ESRCH:
|
|
|
|
return (EZFS_NO_INITIALIZE);
|
|
|
|
}
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Begin, suspend, or cancel the initialization (initializing of all free
|
|
|
|
* blocks) for the given vdevs in the given pool.
|
|
|
|
*/
|
2020-06-15 21:30:37 +03:00
|
|
|
static int
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
|
|
|
|
nvlist_t *vds, boolean_t wait)
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
{
|
2019-03-29 19:13:20 +03:00
|
|
|
int err;
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
|
|
|
|
nvlist_t *vdev_guids = fnvlist_alloc();
|
|
|
|
nvlist_t *guids_to_paths = fnvlist_alloc();
|
2019-03-29 19:13:20 +03:00
|
|
|
nvlist_t *vd_errlist = NULL;
|
|
|
|
nvlist_t *errlist;
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
nvpair_t *elem;
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
|
|
|
|
guids_to_paths, &vd_errlist);
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
if (err != 0) {
|
|
|
|
verify(vd_errlist != NULL);
|
|
|
|
goto list_errors;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = lzc_initialize(zhp->zpool_name, cmd_type,
|
|
|
|
vdev_guids, &errlist);
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
if (err != 0) {
|
2019-03-29 19:13:20 +03:00
|
|
|
if (errlist != NULL) {
|
|
|
|
vd_errlist = fnvlist_lookup_nvlist(errlist,
|
|
|
|
ZPOOL_INITIALIZE_VDEVS);
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
goto list_errors;
|
2019-03-29 19:13:20 +03:00
|
|
|
}
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
(void) zpool_standard_error(zhp->zpool_hdl, err,
|
2019-03-29 19:13:20 +03:00
|
|
|
dgettext(TEXT_DOMAIN, "operation failed"));
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
goto out;
|
2019-03-29 19:13:20 +03:00
|
|
|
}
|
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
if (wait) {
|
|
|
|
for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL;
|
|
|
|
elem = nvlist_next_nvpair(vdev_guids, elem)) {
|
|
|
|
|
|
|
|
uint64_t guid = fnvpair_value_uint64(elem);
|
|
|
|
|
|
|
|
err = lzc_wait_tag(zhp->zpool_name,
|
|
|
|
ZPOOL_WAIT_INITIALIZE, guid, NULL);
|
|
|
|
if (err != 0) {
|
|
|
|
(void) zpool_standard_error_fmt(zhp->zpool_hdl,
|
|
|
|
err, dgettext(TEXT_DOMAIN, "error "
|
|
|
|
"waiting for '%s' to initialize"),
|
|
|
|
nvpair_name(elem));
|
|
|
|
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
list_errors:
|
2019-03-29 19:13:20 +03:00
|
|
|
for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
|
|
|
|
elem = nvlist_next_nvpair(vd_errlist, elem)) {
|
|
|
|
int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
|
|
|
|
char *path;
|
|
|
|
|
|
|
|
if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
|
|
|
|
&path) != 0)
|
|
|
|
path = nvpair_name(elem);
|
|
|
|
|
|
|
|
(void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
|
|
|
|
"cannot initialize '%s'", path);
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
}
|
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
out:
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
fnvlist_free(vdev_guids);
|
2019-03-29 19:13:20 +03:00
|
|
|
fnvlist_free(guids_to_paths);
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
if (vd_errlist != NULL)
|
2019-03-29 19:13:20 +03:00
|
|
|
fnvlist_free(vd_errlist);
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
return (err == 0 ? 0 : -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
|
|
|
|
nvlist_t *vds)
|
|
|
|
{
|
|
|
|
return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
|
|
|
|
nvlist_t *vds)
|
|
|
|
{
|
|
|
|
return (zpool_initialize_impl(zhp, cmd_type, vds, B_TRUE));
|
2019-03-29 19:13:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
xlate_trim_err(int err)
|
|
|
|
{
|
|
|
|
switch (err) {
|
|
|
|
case ENODEV:
|
|
|
|
return (EZFS_NODEVICE);
|
|
|
|
case EINVAL:
|
|
|
|
case EROFS:
|
|
|
|
return (EZFS_BADDEV);
|
|
|
|
case EBUSY:
|
|
|
|
return (EZFS_TRIMMING);
|
|
|
|
case ESRCH:
|
|
|
|
return (EZFS_NO_TRIM);
|
|
|
|
case EOPNOTSUPP:
|
|
|
|
return (EZFS_TRIM_NOTSUP);
|
|
|
|
}
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
2020-03-05 02:07:11 +03:00
|
|
|
static int
|
|
|
|
zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
nvpair_t *elem;
|
|
|
|
|
|
|
|
for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL;
|
|
|
|
elem = nvlist_next_nvpair(vdev_guids, elem)) {
|
|
|
|
|
|
|
|
uint64_t guid = fnvpair_value_uint64(elem);
|
|
|
|
|
|
|
|
err = lzc_wait_tag(zhp->zpool_name,
|
|
|
|
ZPOOL_WAIT_TRIM, guid, NULL);
|
|
|
|
if (err != 0) {
|
|
|
|
(void) zpool_standard_error_fmt(zhp->zpool_hdl,
|
|
|
|
err, dgettext(TEXT_DOMAIN, "error "
|
|
|
|
"waiting to trim '%s'"), nvpair_name(elem));
|
|
|
|
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
|
2020-05-28 03:27:28 +03:00
|
|
|
* Check errlist and report any errors, omitting ones which should be
|
|
|
|
* suppressed. Returns B_TRUE if any errors were reported.
|
2019-03-29 19:13:20 +03:00
|
|
|
*/
|
2020-05-28 03:27:28 +03:00
|
|
|
static boolean_t
|
|
|
|
check_trim_errs(zpool_handle_t *zhp, trimflags_t *trim_flags,
|
|
|
|
nvlist_t *guids_to_paths, nvlist_t *vds, nvlist_t *errlist)
|
2019-03-29 19:13:20 +03:00
|
|
|
{
|
|
|
|
nvpair_t *elem;
|
2020-05-28 03:27:28 +03:00
|
|
|
boolean_t reported_errs = B_FALSE;
|
|
|
|
int num_vds = 0;
|
|
|
|
int num_suppressed_errs = 0;
|
2019-03-29 19:13:20 +03:00
|
|
|
|
2020-05-28 03:27:28 +03:00
|
|
|
for (elem = nvlist_next_nvpair(vds, NULL);
|
|
|
|
elem != NULL; elem = nvlist_next_nvpair(vds, elem)) {
|
|
|
|
num_vds++;
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
}
|
|
|
|
|
2020-05-28 03:27:28 +03:00
|
|
|
for (elem = nvlist_next_nvpair(errlist, NULL);
|
|
|
|
elem != NULL; elem = nvlist_next_nvpair(errlist, elem)) {
|
2019-03-29 19:13:20 +03:00
|
|
|
int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem));
|
|
|
|
char *path;
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
|
2019-03-29 19:13:20 +03:00
|
|
|
/*
|
|
|
|
* If only the pool was specified, and it was not a secure
|
|
|
|
* trim then suppress warnings for individual vdevs which
|
|
|
|
* do not support trimming.
|
|
|
|
*/
|
|
|
|
if (vd_error == EZFS_TRIM_NOTSUP &&
|
|
|
|
trim_flags->fullpool &&
|
|
|
|
!trim_flags->secure) {
|
2020-05-28 03:27:28 +03:00
|
|
|
num_suppressed_errs++;
|
2019-03-29 19:13:20 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-05-28 03:27:28 +03:00
|
|
|
reported_errs = B_TRUE;
|
2019-03-29 19:13:20 +03:00
|
|
|
if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem),
|
|
|
|
&path) != 0)
|
|
|
|
path = nvpair_name(elem);
|
|
|
|
|
|
|
|
(void) zfs_error_fmt(zhp->zpool_hdl, vd_error,
|
|
|
|
"cannot trim '%s'", path);
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
}
|
|
|
|
|
2020-05-28 03:27:28 +03:00
|
|
|
if (num_suppressed_errs == num_vds) {
|
|
|
|
(void) zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"no devices in pool support trim operations"));
|
|
|
|
(void) (zfs_error(zhp->zpool_hdl, EZFS_TRIM_NOTSUP,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot trim")));
|
|
|
|
reported_errs = B_TRUE;
|
|
|
|
}
|
2019-03-29 19:13:20 +03:00
|
|
|
|
2020-05-28 03:27:28 +03:00
|
|
|
return (reported_errs);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Begin, suspend, or cancel the TRIM (discarding of all free blocks) for
|
|
|
|
* the given vdevs in the given pool.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds,
|
|
|
|
trimflags_t *trim_flags)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
int retval = 0;
|
|
|
|
|
|
|
|
nvlist_t *vdev_guids = fnvlist_alloc();
|
|
|
|
nvlist_t *guids_to_paths = fnvlist_alloc();
|
|
|
|
nvlist_t *errlist = NULL;
|
|
|
|
|
|
|
|
err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
|
|
|
|
guids_to_paths, &errlist);
|
|
|
|
if (err != 0) {
|
|
|
|
check_trim_errs(zhp, trim_flags, guids_to_paths, vds, errlist);
|
|
|
|
retval = -1;
|
|
|
|
goto out;
|
2019-03-29 19:13:20 +03:00
|
|
|
}
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
|
2020-05-28 03:27:28 +03:00
|
|
|
err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate,
|
|
|
|
trim_flags->secure, vdev_guids, &errlist);
|
|
|
|
if (err != 0) {
|
|
|
|
nvlist_t *vd_errlist;
|
|
|
|
if (errlist != NULL && nvlist_lookup_nvlist(errlist,
|
|
|
|
ZPOOL_TRIM_VDEVS, &vd_errlist) == 0) {
|
|
|
|
if (check_trim_errs(zhp, trim_flags, guids_to_paths,
|
|
|
|
vds, vd_errlist)) {
|
|
|
|
retval = -1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
char msg[1024];
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "operation failed"));
|
|
|
|
zpool_standard_error(zhp->zpool_hdl, err, msg);
|
|
|
|
retval = -1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (trim_flags->wait)
|
|
|
|
retval = zpool_trim_wait(zhp, vdev_guids);
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (errlist != NULL)
|
|
|
|
fnvlist_free(errlist);
|
|
|
|
fnvlist_free(vdev_guids);
|
|
|
|
fnvlist_free(guids_to_paths);
|
|
|
|
return (retval);
|
OpenZFS 9102 - zfs should be able to initialize storage devices
PROBLEM
========
The first access to a block incurs a performance penalty on some platforms
(e.g. AWS's EBS, VMware VMDKs). Therefore we recommend that volumes are
"thick provisioned", where supported by the platform (VMware). This can
create a large delay in getting a new virtual machines up and running (or
adding storage to an existing Engine). If the thick provision step is
omitted, write performance will be suboptimal until all blocks on the LUN
have been written.
SOLUTION
=========
This feature introduces a way to 'initialize' the disks at install or in the
background to make sure we don't incur this first read penalty.
When an entire LUN is added to ZFS, we make all space available immediately,
and allow ZFS to find unallocated space and zero it out. This works with
concurrent writes to arbitrary offsets, ensuring that we don't zero out
something that has been (or is in the middle of being) written. This scheme
can also be applied to existing pools (affecting only free regions on the
vdev). Detailed design:
- new subcommand:zpool initialize [-cs] <pool> [<vdev> ...]
- start, suspend, or cancel initialization
- Creates new open-context thread for each vdev
- Thread iterates through all metaslabs in this vdev
- Each metaslab:
- select a metaslab
- load the metaslab
- mark the metaslab as being zeroed
- walk all free ranges within that metaslab and translate
them to ranges on the leaf vdev
- issue a "zeroing" I/O on the leaf vdev that corresponds to
a free range on the metaslab we're working on
- continue until all free ranges for this metaslab have been
"zeroed"
- reset/unmark the metaslab being zeroed
- if more metaslabs exist, then repeat above tasks.
- if no more metaslabs, then we're done.
- progress for the initialization is stored on-disk in the vdev’s
leaf zap object. The following information is stored:
- the last offset that has been initialized
- the state of the initialization process (i.e. active,
suspended, or canceled)
- the start time for the initialization
- progress is reported via the zpool status command and shows
information for each of the vdevs that are initializing
Porting notes:
- Added zfs_initialize_value module parameter to set the pattern
written by "zpool initialize".
- Added zfs_vdev_{initializing,removal}_{min,max}_active module options.
Authored by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: loli10K <ezomori.nozomu@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/9102
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/c3963210eb
Closes #8230
2018-12-19 17:54:59 +03:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Scan the pool.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
int
|
2017-07-07 08:16:13 +03:00
|
|
|
zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
2017-07-07 08:16:13 +03:00
|
|
|
int err;
|
2008-11-20 23:01:55 +03:00
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2010-05-29 00:45:14 +04:00
|
|
|
zc.zc_cookie = func;
|
2017-07-07 08:16:13 +03:00
|
|
|
zc.zc_flags = cmd;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-07-07 08:16:13 +03:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
err = errno;
|
|
|
|
|
|
|
|
/* ECANCELED on a scrub means we resumed a paused scrub */
|
|
|
|
if (err == ECANCELED && func == POOL_SCAN_SCRUB &&
|
|
|
|
cmd == POOL_SCRUB_NORMAL)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (func == POOL_SCAN_SCRUB) {
|
2017-07-07 08:16:13 +03:00
|
|
|
if (cmd == POOL_SCRUB_PAUSE) {
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot pause scrubbing %s"), zc.zc_name);
|
|
|
|
} else {
|
|
|
|
assert(cmd == POOL_SCRUB_NORMAL);
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot scrub %s"), zc.zc_name);
|
|
|
|
}
|
2019-05-03 02:42:31 +03:00
|
|
|
} else if (func == POOL_SCAN_RESILVER) {
|
|
|
|
assert(cmd == POOL_SCRUB_NORMAL);
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot restart resilver on %s"), zc.zc_name);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else if (func == POOL_SCAN_NONE) {
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
|
|
|
|
zc.zc_name);
|
|
|
|
} else {
|
|
|
|
assert(!"unexpected result");
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-07-07 08:16:13 +03:00
|
|
|
if (err == EBUSY) {
|
2010-05-29 00:45:14 +04:00
|
|
|
nvlist_t *nvroot;
|
|
|
|
pool_scan_stat_t *ps = NULL;
|
|
|
|
uint_t psc;
|
|
|
|
|
|
|
|
verify(nvlist_lookup_nvlist(zhp->zpool_config,
|
|
|
|
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
|
|
|
|
(void) nvlist_lookup_uint64_array(nvroot,
|
|
|
|
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
|
2020-07-03 21:05:50 +03:00
|
|
|
if (ps && ps->pss_func == POOL_SCAN_SCRUB &&
|
|
|
|
ps->pss_state == DSS_SCANNING) {
|
2017-07-07 08:16:13 +03:00
|
|
|
if (cmd == POOL_SCRUB_PAUSE)
|
|
|
|
return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg));
|
|
|
|
else
|
|
|
|
return (zfs_error(hdl, EZFS_SCRUBBING, msg));
|
|
|
|
} else {
|
2010-05-29 00:45:14 +04:00
|
|
|
return (zfs_error(hdl, EZFS_RESILVERING, msg));
|
2017-07-07 08:16:13 +03:00
|
|
|
}
|
|
|
|
} else if (err == ENOENT) {
|
2010-05-29 00:45:14 +04:00
|
|
|
return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
|
2019-05-03 02:42:31 +03:00
|
|
|
} else if (err == ENOTSUP && func == POOL_SCAN_RESILVER) {
|
|
|
|
return (zfs_error(hdl, EZFS_NO_RESILVER_DEFER, msg));
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
2017-07-07 08:16:13 +03:00
|
|
|
return (zpool_standard_error(hdl, err, msg));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* Find a vdev that matches the search criteria specified. We use the
|
|
|
|
* the nvpair name to determine how we should look for the device.
|
2008-11-20 23:01:55 +03:00
|
|
|
* 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
|
|
|
|
* spare; but FALSE if its an INUSE spare.
|
|
|
|
*/
|
|
|
|
static nvlist_t *
|
2009-07-03 02:44:48 +04:00
|
|
|
vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
|
|
|
|
boolean_t *l2cache, boolean_t *log)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
uint_t c, children;
|
|
|
|
nvlist_t **child;
|
|
|
|
nvlist_t *ret;
|
2008-12-03 23:09:06 +03:00
|
|
|
uint64_t is_log;
|
2009-07-03 02:44:48 +04:00
|
|
|
char *srchkey;
|
|
|
|
nvpair_t *pair = nvlist_next_nvpair(search, NULL);
|
|
|
|
|
|
|
|
/* Nothing to look for */
|
|
|
|
if (search == NULL || pair == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
/* Obtain the key we will use to search */
|
|
|
|
srchkey = nvpair_name(pair);
|
|
|
|
|
|
|
|
switch (nvpair_type(pair)) {
|
2010-08-27 01:24:34 +04:00
|
|
|
case DATA_TYPE_UINT64:
|
2009-07-03 02:44:48 +04:00
|
|
|
if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
|
2010-08-27 01:24:34 +04:00
|
|
|
uint64_t srchval, theguid;
|
|
|
|
|
|
|
|
verify(nvpair_value_uint64(pair, &srchval) == 0);
|
|
|
|
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
|
|
|
|
&theguid) == 0);
|
|
|
|
if (theguid == srchval)
|
|
|
|
return (nv);
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DATA_TYPE_STRING: {
|
|
|
|
char *srchval, *val;
|
|
|
|
|
|
|
|
verify(nvpair_value_string(pair, &srchval) == 0);
|
|
|
|
if (nvlist_lookup_string(nv, srchkey, &val) != 0)
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Search for the requested value. Special cases:
|
|
|
|
*
|
2012-10-18 03:58:54 +04:00
|
|
|
* - ZPOOL_CONFIG_PATH for whole disk entries. These end in
|
|
|
|
* "-part1", or "p1". The suffix is hidden from the user,
|
|
|
|
* but included in the string, so this matches around it.
|
|
|
|
* - ZPOOL_CONFIG_PATH for short names zfs_strcmp_shortname()
|
|
|
|
* is used to check all possible expanded paths.
|
2010-05-29 00:45:14 +04:00
|
|
|
* - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
|
|
|
|
*
|
|
|
|
* Otherwise, all other searches are simple string compares.
|
2009-07-03 02:44:48 +04:00
|
|
|
*/
|
Support shorthand names with zpool remove
zpool status displays abbreviated vdev names without leading path components
and, in the case of whole disks, without partition information. Also, the
zpool subcommands 'create' and 'add' support using shorthand devices names
without qualified paths. Prior to this change, however, removing a device
generally required specifying its name as it is stored in the vdev label. So
while zpool status might list a cache disk with a name like A16, removing it
would require a full path such as /dev/disk/zpool/A16-part1, which is
non-intuitive.
This change adds support for shorthand device names with the remove subcommand
so one can simply type, for example,
zpool remove tank A16
A consequence of this change is that including the partition information when
removing a whole-disk vdev now results in an error. While this is arguably the
correct behavior, it is a departure from how zpool previously worked in this
project.
This change removes the only reference to ctd_check_path(), so that function is
also removed to avoid compiler warnings.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2010-10-14 04:27:41 +04:00
|
|
|
if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0) {
|
2009-07-03 02:44:48 +04:00
|
|
|
uint64_t wholedisk = 0;
|
|
|
|
|
|
|
|
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
|
|
|
|
&wholedisk);
|
2012-10-18 03:58:54 +04:00
|
|
|
if (zfs_strcmp_pathname(srchval, val, wholedisk) == 0)
|
|
|
|
return (nv);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
} else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
|
|
|
|
char *type, *idx, *end, *p;
|
|
|
|
uint64_t id, vdev_id;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine our vdev type, keeping in mind
|
|
|
|
* that the srchval is composed of a type and
|
|
|
|
* vdev id pair (i.e. mirror-4).
|
|
|
|
*/
|
|
|
|
if ((type = strdup(srchval)) == NULL)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
if ((p = strrchr(type, '-')) == NULL) {
|
|
|
|
free(type);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
idx = p + 1;
|
|
|
|
*p = '\0';
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the types don't match then keep looking.
|
|
|
|
*/
|
|
|
|
if (strncmp(val, type, strlen(val)) != 0) {
|
|
|
|
free(type);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-01-19 20:20:58 +03:00
|
|
|
verify(zpool_vdev_is_interior(type));
|
2010-05-29 00:45:14 +04:00
|
|
|
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
|
|
|
|
&id) == 0);
|
|
|
|
|
|
|
|
errno = 0;
|
|
|
|
vdev_id = strtoull(idx, &end, 10);
|
|
|
|
|
2021-03-26 21:12:22 +03:00
|
|
|
/*
|
|
|
|
* If we are looking for a raidz and a parity is
|
|
|
|
* specified, make sure it matches.
|
|
|
|
*/
|
|
|
|
int rzlen = strlen(VDEV_TYPE_RAIDZ);
|
|
|
|
assert(rzlen == strlen(VDEV_TYPE_DRAID));
|
|
|
|
int typlen = strlen(type);
|
|
|
|
if ((strncmp(type, VDEV_TYPE_RAIDZ, rzlen) == 0 ||
|
|
|
|
strncmp(type, VDEV_TYPE_DRAID, rzlen) == 0) &&
|
|
|
|
typlen != rzlen) {
|
|
|
|
uint64_t vdev_parity;
|
|
|
|
int parity = *(type + rzlen) - '0';
|
|
|
|
|
|
|
|
if (parity <= 0 || parity > 3 ||
|
|
|
|
(typlen - rzlen) != 1) {
|
|
|
|
/*
|
|
|
|
* Nonsense parity specified, can
|
|
|
|
* never match
|
|
|
|
*/
|
|
|
|
free(type);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
verify(nvlist_lookup_uint64(nv,
|
|
|
|
ZPOOL_CONFIG_NPARITY, &vdev_parity) == 0);
|
|
|
|
if ((int)vdev_parity != parity) {
|
|
|
|
free(type);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
free(type);
|
|
|
|
if (errno != 0)
|
|
|
|
return (NULL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now verify that we have the correct vdev id.
|
|
|
|
*/
|
|
|
|
if (vdev_id == id)
|
|
|
|
return (nv);
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* Common case
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
if (strcmp(srchval, val) == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (nv);
|
2009-07-03 02:44:48 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
break;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
|
|
|
|
&child, &children) != 0)
|
|
|
|
return (NULL);
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
for (c = 0; c < children; c++) {
|
2009-07-03 02:44:48 +04:00
|
|
|
if ((ret = vdev_to_nvlist_iter(child[c], search,
|
2008-12-03 23:09:06 +03:00
|
|
|
avail_spare, l2cache, NULL)) != NULL) {
|
|
|
|
/*
|
|
|
|
* The 'is_log' value is only set for the toplevel
|
|
|
|
* vdev, not the leaf vdevs. So we always lookup the
|
|
|
|
* log device from the root of the vdev tree (where
|
|
|
|
* 'log' is non-NULL).
|
|
|
|
*/
|
|
|
|
if (log != NULL &&
|
|
|
|
nvlist_lookup_uint64(child[c],
|
|
|
|
ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
|
|
|
|
is_log) {
|
|
|
|
*log = B_TRUE;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
return (ret);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
|
|
|
|
&child, &children) == 0) {
|
|
|
|
for (c = 0; c < children; c++) {
|
2009-07-03 02:44:48 +04:00
|
|
|
if ((ret = vdev_to_nvlist_iter(child[c], search,
|
2008-12-03 23:09:06 +03:00
|
|
|
avail_spare, l2cache, NULL)) != NULL) {
|
2008-11-20 23:01:55 +03:00
|
|
|
*avail_spare = B_TRUE;
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
|
|
|
|
&child, &children) == 0) {
|
|
|
|
for (c = 0; c < children; c++) {
|
2009-07-03 02:44:48 +04:00
|
|
|
if ((ret = vdev_to_nvlist_iter(child[c], search,
|
2008-12-03 23:09:06 +03:00
|
|
|
avail_spare, l2cache, NULL)) != NULL) {
|
2008-11-20 23:01:55 +03:00
|
|
|
*l2cache = B_TRUE;
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
* Given a physical path or guid, find the associated vdev.
|
2009-07-03 02:44:48 +04:00
|
|
|
*/
|
|
|
|
nvlist_t *
|
|
|
|
zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
|
|
|
|
boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
|
|
|
|
{
|
|
|
|
nvlist_t *search, *nvroot, *ret;
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
uint64_t guid;
|
|
|
|
char *end;
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
|
Add support for autoexpand property
While the autoexpand property may seem like a small feature it
depends on a significant amount of system infrastructure. Enough
of that infrastructure is now in place that with a few modifications
for Linux it can be supported.
Auto-expand works as follows; when a block device is modified
(re-sized, closed after being open r/w, etc) a change uevent is
generated for udev. The ZED, which is monitoring udev events,
passes the change event along to zfs_deliver_dle() if the disk
or partition contains a zfs_member as identified by blkid.
From here the device is matched against all imported pool vdevs
using the vdev_guid which was read from the label by blkid. If
a match is found the ZED reopens the pool vdev. This re-opening
is important because it allows the vdev to be briefly closed so
the disk partition table can be re-read. Otherwise, it wouldn't
be possible to report the maximum possible expansion size.
Finally, if the property autoexpand=on a vdev expansion will be
attempted. After performing some sanity checks on the disk to
verify that it is safe to expand, the primary partition (-part1)
will be expanded and the partition table updated. The partition
is then re-opened (again) to detect the updated size which allows
the new capacity to be used.
In order to make all of the above possible the following changes
were required:
* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests.
These tests now create a pool which is layered on a loopback,
scsi_debug, and file vdev. This allows for testing of non-
partitioned block device (loopback), a partition block device
(scsi_debug), and a file which does not receive udev change
events. This provided for better test coverage, and by removing
the layering on ZFS volumes there issues surrounding layering
one pool on another are avoided.
* zpool_find_vdev_by_physpath() updated to accept a vdev guid.
This allows for matching by guid rather than path which is a
more reliable way for the ZED to reference a vdev.
* Fixed zfs_zevent_wait() signal handling which could result
in the ZED spinning when a signal was not handled.
* Removed vdev_disk_rrpart() functionality which can be abandoned
in favor of kernel provided blkdev_reread_part() function.
* Added a rwlock which is held as a writer while a disk is being
reopened. This is important to prevent errors from occurring
for any configuration related IOs which bypass the SCL_ZIO lock.
The zpool_reopen_007_pos.ksh test case was added to verify IO
error are never observed when reopening. This is not expected
to impact IO performance.
Additional fixes which aren't critical but were discovered and
resolved in the course of developing this functionality.
* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for
ZFS volumes. This is as good as a unique physical path, while the
volumes are not used in the test cases anymore for other reasons
this improvement was included.
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #120
Closes #2437
Closes #5771
Closes #7366
Closes #7582
Closes #7629
2018-07-24 01:40:15 +03:00
|
|
|
|
|
|
|
guid = strtoull(ppath, &end, 0);
|
|
|
|
if (guid != 0 && *end == '\0') {
|
|
|
|
verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
|
|
|
|
} else {
|
|
|
|
verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH,
|
|
|
|
ppath) == 0);
|
|
|
|
}
|
2009-07-03 02:44:48 +04:00
|
|
|
|
|
|
|
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
|
|
|
|
&nvroot) == 0);
|
|
|
|
|
|
|
|
*avail_spare = B_FALSE;
|
2010-08-27 01:24:34 +04:00
|
|
|
*l2cache = B_FALSE;
|
|
|
|
if (log != NULL)
|
|
|
|
*log = B_FALSE;
|
2009-07-03 02:44:48 +04:00
|
|
|
ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
|
|
|
|
nvlist_free(search);
|
|
|
|
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Determine if we have an "interior" top-level vdev (i.e mirror/raidz).
|
|
|
|
*/
|
2018-01-19 20:20:58 +03:00
|
|
|
static boolean_t
|
2010-05-29 00:45:14 +04:00
|
|
|
zpool_vdev_is_interior(const char *name)
|
|
|
|
{
|
|
|
|
if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
|
2018-01-19 20:20:58 +03:00
|
|
|
strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 ||
|
|
|
|
strncmp(name,
|
|
|
|
VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 ||
|
2010-05-29 00:45:14 +04:00
|
|
|
strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
|
|
|
|
return (B_TRUE);
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
|
|
|
|
if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 &&
|
|
|
|
!zpool_is_draid_spare(name))
|
|
|
|
return (B_TRUE);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_t *
|
|
|
|
zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
|
2008-12-03 23:09:06 +03:00
|
|
|
boolean_t *l2cache, boolean_t *log)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
char *end;
|
2009-07-03 02:44:48 +04:00
|
|
|
nvlist_t *nvroot, *search, *ret;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t guid;
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
|
|
|
|
|
Make command line guid parsing more tolerant
Several of the zfs utilities allow you to pass a vdev's guid rather
than the device name. However, the utilities are not consistent in
how they parse that guid. For example, 'zinject' expects the guid
to be passed as a hex value while 'zpool replace' wants it as a
decimal. The user is forced to just know what format to use.
This patch improve things by making the parsing more tolerant.
When strtol(3) is called using 0 for the base, rather than say
10 or 16, it will then accept hex, decimal, or octal input based
on the prefix. From the man page.
If base is zero or 16, the string may then include a "0x"
prefix, and the number will be read in base 16; otherwise,
a zero base is taken as 10 (decimal) unless the next character
is '0', in which case it is taken as 8 (octal).
NOTE: There may be additional conversions not caught be this patch.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Chris Dunlap <cdunlap@llnl.gov>
Issue #2
2014-01-25 03:27:59 +04:00
|
|
|
guid = strtoull(path, &end, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (guid != 0 && *end == '\0') {
|
2009-07-03 02:44:48 +04:00
|
|
|
verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else if (zpool_vdev_is_interior(path)) {
|
|
|
|
verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2009-07-03 02:44:48 +04:00
|
|
|
verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
|
|
|
|
&nvroot) == 0);
|
|
|
|
|
|
|
|
*avail_spare = B_FALSE;
|
|
|
|
*l2cache = B_FALSE;
|
2008-12-03 23:09:06 +03:00
|
|
|
if (log != NULL)
|
|
|
|
*log = B_FALSE;
|
2009-07-03 02:44:48 +04:00
|
|
|
ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
|
|
|
|
nvlist_free(search);
|
|
|
|
|
|
|
|
return (ret);
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
vdev_is_online(nvlist_t *nv)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
|
|
|
uint64_t ival;
|
|
|
|
|
|
|
|
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
|
|
|
|
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
|
|
|
|
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2009-07-03 02:44:48 +04:00
|
|
|
* Helper function for zpool_get_physpaths().
|
2008-12-03 23:09:06 +03:00
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
static int
|
|
|
|
vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
|
|
|
|
size_t *bytes_written)
|
|
|
|
{
|
|
|
|
size_t bytes_left, pos, rsz;
|
|
|
|
char *tmppath;
|
|
|
|
const char *format;
|
|
|
|
|
|
|
|
if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
|
|
|
|
&tmppath) != 0)
|
|
|
|
return (EZFS_NODEVICE);
|
|
|
|
|
|
|
|
pos = *bytes_written;
|
|
|
|
bytes_left = physpath_size - pos;
|
|
|
|
format = (pos == 0) ? "%s" : " %s";
|
|
|
|
|
|
|
|
rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
|
|
|
|
*bytes_written += rsz;
|
|
|
|
|
|
|
|
if (rsz >= bytes_left) {
|
|
|
|
/* if physpath was not copied properly, clear it */
|
|
|
|
if (bytes_left != 0) {
|
|
|
|
physpath[pos] = 0;
|
|
|
|
}
|
|
|
|
return (EZFS_NOSPC);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
|
|
|
|
size_t *rsz, boolean_t is_spare)
|
|
|
|
{
|
|
|
|
char *type;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
|
|
|
|
return (EZFS_INVALCONFIG);
|
|
|
|
|
|
|
|
if (strcmp(type, VDEV_TYPE_DISK) == 0) {
|
|
|
|
/*
|
|
|
|
* An active spare device has ZPOOL_CONFIG_IS_SPARE set.
|
|
|
|
* For a spare vdev, we only want to boot from the active
|
|
|
|
* spare device.
|
|
|
|
*/
|
|
|
|
if (is_spare) {
|
|
|
|
uint64_t spare = 0;
|
|
|
|
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
|
|
|
|
&spare);
|
|
|
|
if (!spare)
|
|
|
|
return (EZFS_INVALCONFIG);
|
|
|
|
}
|
|
|
|
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 06:20:35 +03:00
|
|
|
if (vdev_is_online(nv)) {
|
2009-07-03 02:44:48 +04:00
|
|
|
if ((ret = vdev_get_one_physpath(nv, physpath,
|
|
|
|
phypath_size, rsz)) != 0)
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
|
2017-01-26 23:47:40 +03:00
|
|
|
strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
|
2009-07-03 02:44:48 +04:00
|
|
|
strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
|
|
|
|
(is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
|
|
|
|
nvlist_t **child;
|
|
|
|
uint_t count;
|
|
|
|
int i, ret;
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nv,
|
|
|
|
ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
|
|
|
|
return (EZFS_INVALCONFIG);
|
|
|
|
|
|
|
|
for (i = 0; i < count; i++) {
|
|
|
|
ret = vdev_get_physpaths(child[i], physpath,
|
|
|
|
phypath_size, rsz, is_spare);
|
|
|
|
if (ret == EZFS_NOSPC)
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (EZFS_POOL_INVALARG);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get phys_path for a root pool config.
|
|
|
|
* Return 0 on success; non-zero on failure.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
|
2008-12-03 23:09:06 +03:00
|
|
|
{
|
2009-07-03 02:44:48 +04:00
|
|
|
size_t rsz;
|
2008-12-03 23:09:06 +03:00
|
|
|
nvlist_t *vdev_root;
|
|
|
|
nvlist_t **child;
|
|
|
|
uint_t count;
|
2009-07-03 02:44:48 +04:00
|
|
|
char *type;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
rsz = 0;
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
|
|
|
|
&vdev_root) != 0)
|
|
|
|
return (EZFS_INVALCONFIG);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
|
|
|
|
nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
|
2008-12-03 23:09:06 +03:00
|
|
|
&child, &count) != 0)
|
2009-07-03 02:44:48 +04:00
|
|
|
return (EZFS_INVALCONFIG);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
2017-01-27 21:40:02 +03:00
|
|
|
* root pool can only have a single top-level vdev.
|
2009-07-03 02:44:48 +04:00
|
|
|
*/
|
2017-01-27 21:40:02 +03:00
|
|
|
if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1)
|
2009-07-03 02:44:48 +04:00
|
|
|
return (EZFS_POOL_INVALARG);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
|
|
|
|
B_FALSE);
|
|
|
|
|
|
|
|
/* No online devices */
|
|
|
|
if (rsz == 0)
|
|
|
|
return (EZFS_NODEVICE);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
|
|
|
return (0);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* Get phys_path for a root pool
|
|
|
|
* Return 0 on success; non-zero on failure.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
|
|
|
|
{
|
|
|
|
return (zpool_get_config_physpath(zhp->zpool_config, physpath,
|
|
|
|
phypath_size));
|
|
|
|
}
|
|
|
|
|
2017-05-19 22:30:16 +03:00
|
|
|
/*
|
|
|
|
* Convert a vdev path to a GUID. Returns GUID or 0 on error.
|
|
|
|
*
|
|
|
|
* If is_spare, is_l2cache, or is_log is non-NULL, then store within it
|
|
|
|
* if the VDEV is a spare, l2cache, or log device. If they're NULL then
|
|
|
|
* ignore them.
|
|
|
|
*/
|
|
|
|
static uint64_t
|
|
|
|
zpool_vdev_path_to_guid_impl(zpool_handle_t *zhp, const char *path,
|
|
|
|
boolean_t *is_spare, boolean_t *is_l2cache, boolean_t *is_log)
|
|
|
|
{
|
|
|
|
uint64_t guid;
|
|
|
|
boolean_t spare = B_FALSE, l2cache = B_FALSE, log = B_FALSE;
|
|
|
|
nvlist_t *tgt;
|
|
|
|
|
|
|
|
if ((tgt = zpool_find_vdev(zhp, path, &spare, &l2cache,
|
|
|
|
&log)) == NULL)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &guid) == 0);
|
|
|
|
if (is_spare != NULL)
|
|
|
|
*is_spare = spare;
|
|
|
|
if (is_l2cache != NULL)
|
|
|
|
*is_l2cache = l2cache;
|
|
|
|
if (is_log != NULL)
|
|
|
|
*is_log = log;
|
|
|
|
|
|
|
|
return (guid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Convert a vdev path to a GUID. Returns GUID or 0 on error. */
|
|
|
|
uint64_t
|
|
|
|
zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path)
|
|
|
|
{
|
|
|
|
return (zpool_vdev_path_to_guid_impl(zhp, path, NULL, NULL, NULL));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Bring the specified vdev online. The 'flags' parameter is a set of the
|
|
|
|
* ZFS_ONLINE_* flags.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
|
|
|
|
vdev_state_t *newstate)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
2017-12-11 10:11:25 +03:00
|
|
|
char *pathname;
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_t *tgt;
|
2009-07-03 02:44:48 +04:00
|
|
|
boolean_t avail_spare, l2cache, islog;
|
2008-11-20 23:01:55 +03:00
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
2012-07-06 17:44:14 +04:00
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (flags & ZFS_ONLINE_EXPAND) {
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
|
|
|
|
} else {
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot online %s"), path);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
|
2009-07-03 02:44:48 +04:00
|
|
|
&islog)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (avail_spare)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_ISSPARE, msg));
|
|
|
|
|
2017-12-11 10:11:25 +03:00
|
|
|
if ((flags & ZFS_ONLINE_EXPAND ||
|
|
|
|
zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) &&
|
|
|
|
nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) {
|
2009-07-03 02:44:48 +04:00
|
|
|
uint64_t wholedisk = 0;
|
|
|
|
|
|
|
|
(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
|
|
|
|
&wholedisk);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX - L2ARC 1.0 devices can't support expansion.
|
|
|
|
*/
|
|
|
|
if (l2cache) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot expand cache devices"));
|
|
|
|
return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (wholedisk) {
|
2012-07-06 18:22:03 +04:00
|
|
|
const char *fullpath = path;
|
|
|
|
char buf[MAXPATHLEN];
|
|
|
|
|
|
|
|
if (path[0] != '/') {
|
|
|
|
error = zfs_resolve_shortname(path, buf,
|
2013-11-01 23:26:11 +04:00
|
|
|
sizeof (buf));
|
2012-07-06 18:22:03 +04:00
|
|
|
if (error != 0)
|
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE,
|
|
|
|
msg));
|
|
|
|
|
|
|
|
fullpath = buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = zpool_relabel_disk(hdl, fullpath, msg);
|
2012-07-06 17:44:14 +04:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
zc.zc_cookie = VDEV_STATE_ONLINE;
|
|
|
|
zc.zc_obj = flags;
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
if (errno == EINVAL) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
|
|
|
|
"from this pool into a new one. Use '%s' "
|
|
|
|
"instead"), "zpool detach");
|
|
|
|
return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
*newstate = zc.zc_cookie;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Take the specified vdev offline
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
nvlist_t *tgt;
|
|
|
|
boolean_t avail_spare, l2cache;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
|
|
|
|
NULL)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (avail_spare)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_ISSPARE, msg));
|
|
|
|
|
|
|
|
zc.zc_cookie = VDEV_STATE_OFFLINE;
|
|
|
|
zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
switch (errno) {
|
|
|
|
case EBUSY:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are no other replicas of this device.
|
|
|
|
*/
|
|
|
|
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
case EEXIST:
|
|
|
|
/*
|
|
|
|
* The log device has unplayed logs
|
|
|
|
*/
|
|
|
|
return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
default:
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark the given vdev faulted.
|
|
|
|
*/
|
|
|
|
int
|
2010-05-29 00:45:14 +04:00
|
|
|
zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
2013-11-01 23:26:11 +04:00
|
|
|
dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
zc.zc_guid = guid;
|
|
|
|
zc.zc_cookie = VDEV_STATE_FAULTED;
|
2010-05-29 00:45:14 +04:00
|
|
|
zc.zc_obj = aux;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2019-10-24 03:29:43 +03:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
switch (errno) {
|
|
|
|
case EBUSY:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are no other replicas of this device.
|
|
|
|
*/
|
|
|
|
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
|
|
|
|
|
|
|
|
default:
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark the given vdev degraded.
|
|
|
|
*/
|
|
|
|
int
|
2010-05-29 00:45:14 +04:00
|
|
|
zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
2013-11-01 23:26:11 +04:00
|
|
|
dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
zc.zc_guid = guid;
|
|
|
|
zc.zc_cookie = VDEV_STATE_DEGRADED;
|
2010-05-29 00:45:14 +04:00
|
|
|
zc.zc_obj = aux;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2019-10-24 03:29:43 +03:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns TRUE if the given nvlist is a vdev that was originally swapped in as
|
|
|
|
* a hot spare.
|
|
|
|
*/
|
|
|
|
static boolean_t
|
|
|
|
is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
|
|
|
|
{
|
|
|
|
nvlist_t **child;
|
|
|
|
uint_t c, children;
|
|
|
|
char *type;
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child,
|
|
|
|
&children) == 0) {
|
|
|
|
verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
|
|
|
|
&type) == 0);
|
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
if ((strcmp(type, VDEV_TYPE_SPARE) == 0 ||
|
|
|
|
strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) &&
|
2008-11-20 23:01:55 +03:00
|
|
|
children == 2 && child[which] == tgt)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
|
|
|
for (c = 0; c < children; c++)
|
|
|
|
if (is_replacing_spare(child[c], tgt, which))
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Attach new_disk (fully described by nvroot) to old_disk.
|
|
|
|
* If 'replacing' is specified, the new disk will replace the old one.
|
|
|
|
*/
|
|
|
|
int
|
2020-07-03 21:05:50 +03:00
|
|
|
zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
|
|
|
|
const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
int ret;
|
|
|
|
nvlist_t *tgt;
|
2008-12-03 23:09:06 +03:00
|
|
|
boolean_t avail_spare, l2cache, islog;
|
|
|
|
uint64_t val;
|
2010-08-27 01:24:34 +04:00
|
|
|
char *newname;
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_t **child;
|
|
|
|
uint_t children;
|
|
|
|
nvlist_t *config_root;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
if (replacing)
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot replace %s with %s"), old_disk, new_disk);
|
|
|
|
else
|
|
|
|
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot attach %s to %s"), new_disk, old_disk);
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
&islog)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
|
|
|
|
if (avail_spare)
|
|
|
|
return (zfs_error(hdl, EZFS_ISSPARE, msg));
|
|
|
|
|
|
|
|
if (l2cache)
|
|
|
|
return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
|
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
|
|
|
|
zc.zc_cookie = replacing;
|
2020-07-03 21:05:50 +03:00
|
|
|
zc.zc_simple = rebuild;
|
|
|
|
|
|
|
|
if (rebuild &&
|
|
|
|
zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"the loaded zfs module doesn't support device rebuilds"));
|
|
|
|
return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
|
|
|
|
&child, &children) != 0 || children != 1) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"new device must be a single disk"));
|
|
|
|
return (zfs_error(hdl, EZFS_INVALCONFIG, msg));
|
|
|
|
}
|
|
|
|
|
|
|
|
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
|
|
|
|
ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
|
|
|
|
|
2013-12-29 22:40:46 +04:00
|
|
|
if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL)
|
2008-12-03 23:09:06 +03:00
|
|
|
return (-1);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If the target is a hot spare that has been swapped in, we can only
|
|
|
|
* replace it with another hot spare.
|
|
|
|
*/
|
|
|
|
if (replacing &&
|
|
|
|
nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
|
2008-12-03 23:09:06 +03:00
|
|
|
(zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
|
|
|
|
NULL) == NULL || !avail_spare) &&
|
|
|
|
is_replacing_spare(config_root, tgt, 1)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"can only be replaced by another hot spare"));
|
2008-12-03 23:09:06 +03:00
|
|
|
free(newname);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_BADTARGET, msg));
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
free(newname);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
|
|
|
|
return (-1);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
|
Remove GRUB restrictions
The GRUB restrictions are based around the pool's bootfs property.
Given the current situation where GRUB is not staying current with
OpenZFS pool features, having either a non-ZFS /boot or a separate
pool with limited features are pretty much the only long-term answers
for GRUB support. Only the second case matters in this context. For
the restrictions to be useful, the bootfs property would have to be set
on the boot pool, because that is where we need the restrictions, as
that is the pool that GRUB reads from. The documentation for bootfs
describes it as pointing to the root pool. That's also how it's used in
the initramfs. ZFS does not allow setting bootfs to point to a dataset
in another pool. (If it did, it'd be difficult-to-impossible to enforce
these restrictions cross-pool). Accordingly, bootfs is pretty much
useless for GRUB scenarios moving forward.
Even for users who have only one pool, the existing restrictions for
GRUB are incomplete. They don't prevent you from enabling the
unsupported checksums, for example. For that reason, I have ripped out
all the GRUB restrictions.
A little longer-term, I think extending the proposed features=portable
system to define a features=grub is a much more useful approach. The
user could set that on the boot pool at creation, and things would
Just Work.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Richard Laager <rlaager@wiktel.com>
Closes #8627
2020-08-18 09:12:39 +03:00
|
|
|
if (ret == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
switch (errno) {
|
|
|
|
case ENOTSUP:
|
|
|
|
/*
|
|
|
|
* Can't attach to or replace this type of vdev.
|
|
|
|
*/
|
|
|
|
if (replacing) {
|
2010-08-27 01:24:34 +04:00
|
|
|
uint64_t version = zpool_get_prop_int(zhp,
|
|
|
|
ZPOOL_PROP_VERSION, NULL);
|
|
|
|
|
2020-07-03 21:05:50 +03:00
|
|
|
if (islog) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot replace a log with a spare"));
|
2020-07-03 21:05:50 +03:00
|
|
|
} else if (rebuild) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
"only mirror and dRAID vdevs support "
|
|
|
|
"sequential reconstruction"));
|
|
|
|
} else if (zpool_is_draid_spare(new_disk)) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"dRAID spares can only replace child "
|
|
|
|
"devices in their parent's dRAID vdev"));
|
2020-07-03 21:05:50 +03:00
|
|
|
} else if (version >= SPA_VERSION_MULTI_REPLACE) {
|
2010-08-27 01:24:34 +04:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"already in replacing/spare config; wait "
|
|
|
|
"for completion or use 'zpool detach'"));
|
2020-07-03 21:05:50 +03:00
|
|
|
} else {
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot replace a replacing device"));
|
2020-07-03 21:05:50 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"can only attach to mirrors and top-level "
|
|
|
|
"disks"));
|
|
|
|
}
|
|
|
|
(void) zfs_error(hdl, EZFS_BADTARGET, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case EINVAL:
|
|
|
|
/*
|
|
|
|
* The new device must be a single disk.
|
|
|
|
*/
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"new device must be a single disk"));
|
|
|
|
(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case EBUSY:
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
|
OpenZFS 9290 - device removal reduces redundancy of mirrors
Mirrors are supposed to provide redundancy in the face of whole-disk
failure and silent damage (e.g. some data on disk is not right, but ZFS
hasn't detected the whole device as being broken). However, the current
device removal implementation bypasses some of the mirror's redundancy.
Note that in no case is incorrect data returned, but we might get a
checksum error when we should have been able to find the right data.
There are two underlying problems:
1. When we remove a mirror device, we only read one side of the mirror.
Since we can't verify the checksum, this side may be silently bad, but
the good data is on the other side of the mirror (which we didn't read).
This can cause the removal to "bake in" the busted data – all copies of
the data in the new location are the same, busted version, while we left
the good version behind.
The fix for this is to read and copy both sides of the mirror. If the
old and new vdevs are mirrors, we will read both sides of the old
mirror, and write each copy to the corresponding side of the new mirror.
(If the old and new vdevs have a different number of children, we will
do this as best as possible.) Even though we aren't verifying checksums,
this ensures that as long as there's a good copy of the data, we'll have
a good copy after the removal, even if there's silent damage to one side
of the mirror. If we're removing a mirror that has some silent damage,
we'll have exactly the same damage in the new location (assuming that
the new location is also a mirror).
2. When we read from an indirect vdev that points to a mirror vdev, we
only consider one copy of the data. This can lead to reduced effective
redundancy, because we might read a bad copy of the data from one side
of the mirror, and not retry the other, good side of the mirror.
Note that the problem is not with the removal process, but rather after
the removal has completed (having copied correct data to both sides of
the mirror), if one side of the new mirror is silently damaged, we
encounter the problem when reading the relocated data via the indirect
vdev. Also note that the problem doesn't occur when ZFS knows that one
side of the mirror is bad, e.g. when a disk entirely fails or is
offlined.
The impact is that reads (from indirect vdevs that point to mirrors) may
return a checksum error even though the good data exists on one side of
the mirror, and scrub doesn't repair all data on the mirror (if some of
it is pointed to via an indirect vdev).
The fix for this is complicated by "split blocks" - one logical block
may be split into two (or more) pieces with each piece moved to a
different new location. In this case we need to read all versions of
each split (one from each side of the mirror), and figure out which
combination of versions results in the correct checksum, and then repair
the incorrect versions.
This ensures that we supply the same redundancy whether you use device
removal or not. For example, if a mirror has small silent errors on all
of its children, we can still reconstruct the correct data, as long as
those errors are at sufficiently-separated offsets (specifically,
separated by the largest block size - default of 128KB, but up to 16MB).
Porting notes:
* A new indirect vdev check was moved from dsl_scan_needs_resilver_cb()
to dsl_scan_needs_resilver(), which was added to ZoL as part of the
sequential scrub work.
* Passed NULL for zfs_ereport_post_checksum()'s zbookmark_phys_t
parameter. The extra parameter is unique to ZoL.
* When posting indirect checksum errors the ABD can be passed directly,
zfs_ereport_post_checksum() is not yet ABD-aware in OpenZFS.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Ported-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://illumos.org/issues/9290
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/591
Closes #6900
2018-02-13 22:37:56 +03:00
|
|
|
"or device removal is in progress"),
|
2008-11-20 23:01:55 +03:00
|
|
|
new_disk);
|
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case EOVERFLOW:
|
|
|
|
/*
|
|
|
|
* The new device is too small.
|
|
|
|
*/
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"device is too small"));
|
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case EDOM:
|
|
|
|
/*
|
2014-06-27 03:36:13 +04:00
|
|
|
* The new device has a different optimal sector size.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
2014-06-27 03:36:13 +04:00
|
|
|
"new device has a different optimal sector size; use the "
|
|
|
|
"option '-o ashift=N' to override the optimal size"));
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) zfs_error(hdl, EZFS_BADDEV, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ENAMETOOLONG:
|
|
|
|
/*
|
|
|
|
* The resulting top-level vdev spec won't fit in the label.
|
|
|
|
*/
|
|
|
|
(void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
(void) zpool_standard_error(hdl, errno, msg);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Detach the specified device.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
nvlist_t *tgt;
|
|
|
|
boolean_t avail_spare, l2cache;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
NULL)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
|
|
|
|
if (avail_spare)
|
|
|
|
return (zfs_error(hdl, EZFS_ISSPARE, msg));
|
|
|
|
|
|
|
|
if (l2cache)
|
|
|
|
return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
|
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
|
|
|
|
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
switch (errno) {
|
|
|
|
|
|
|
|
case ENOTSUP:
|
|
|
|
/*
|
|
|
|
* Can't detach from this type of vdev.
|
|
|
|
*/
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
|
|
|
|
"applicable to mirror and replacing vdevs"));
|
2010-08-27 01:24:34 +04:00
|
|
|
(void) zfs_error(hdl, EZFS_BADTARGET, msg);
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
|
|
|
|
case EBUSY:
|
|
|
|
/*
|
|
|
|
* There are no other replicas of this device.
|
|
|
|
*/
|
|
|
|
(void) zfs_error(hdl, EZFS_NOREPLICAS, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
(void) zpool_standard_error(hdl, errno, msg);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Find a mirror vdev in the source nvlist.
|
|
|
|
*
|
|
|
|
* The mchild array contains a list of disks in one of the top-level mirrors
|
|
|
|
* of the source pool. The schild array contains a list of disks that the
|
|
|
|
* user specified on the command line. We loop over the mchild array to
|
|
|
|
* see if any entry in the schild array matches.
|
|
|
|
*
|
|
|
|
* If a disk in the mchild array is found in the schild array, we return
|
|
|
|
* the index of that entry. Otherwise we return -1.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
|
|
|
|
nvlist_t **schild, uint_t schildren)
|
|
|
|
{
|
|
|
|
uint_t mc;
|
|
|
|
|
|
|
|
for (mc = 0; mc < mchildren; mc++) {
|
|
|
|
uint_t sc;
|
|
|
|
char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
|
2013-12-29 22:40:46 +04:00
|
|
|
mchild[mc], 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
for (sc = 0; sc < schildren; sc++) {
|
|
|
|
char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
|
2013-12-29 22:40:46 +04:00
|
|
|
schild[sc], 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
boolean_t result = (strcmp(mpath, spath) == 0);
|
|
|
|
|
|
|
|
free(spath);
|
|
|
|
if (result) {
|
|
|
|
free(mpath);
|
|
|
|
return (mc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
free(mpath);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Split a mirror pool. If newroot points to null, then a new nvlist
|
|
|
|
* is generated and it is the responsibility of the caller to free it.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
|
|
|
|
nvlist_t *props, splitflags_t flags)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2020-12-05 01:04:39 +03:00
|
|
|
char msg[1024], *bias;
|
2010-05-29 00:45:14 +04:00
|
|
|
nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
|
|
|
|
nvlist_t **varray = NULL, *zc_props = NULL;
|
|
|
|
uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
2018-04-12 20:57:24 +03:00
|
|
|
uint64_t vers, readonly = B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
boolean_t freelist = B_FALSE, memory_err = B_TRUE;
|
|
|
|
int retval = 0;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
|
|
|
|
|
|
|
|
if (!zpool_name_valid(hdl, B_FALSE, newname))
|
|
|
|
return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
|
|
|
|
|
|
|
|
if ((config = zpool_get_config(zhp, NULL)) == NULL) {
|
|
|
|
(void) fprintf(stderr, gettext("Internal error: unable to "
|
|
|
|
"retrieve pool configuration\n"));
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
|
|
|
|
== 0);
|
|
|
|
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
|
|
|
|
|
|
|
|
if (props) {
|
2010-08-27 01:24:34 +04:00
|
|
|
prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
|
2010-08-27 01:24:34 +04:00
|
|
|
props, vers, flags, msg)) == NULL)
|
2010-05-29 00:45:14 +04:00
|
|
|
return (-1);
|
2018-04-12 20:57:24 +03:00
|
|
|
(void) nvlist_lookup_uint64(zc_props,
|
|
|
|
zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
|
|
|
|
if (readonly) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"property %s can only be set at import time"),
|
|
|
|
zpool_prop_to_name(ZPOOL_PROP_READONLY));
|
|
|
|
return (-1);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
|
|
|
|
&children) != 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"Source pool is missing vdev tree"));
|
2016-04-01 06:54:07 +03:00
|
|
|
nvlist_free(zc_props);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
|
|
|
|
vcount = 0;
|
|
|
|
|
|
|
|
if (*newroot == NULL ||
|
|
|
|
nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
|
|
|
|
&newchild, &newchildren) != 0)
|
|
|
|
newchildren = 0;
|
|
|
|
|
|
|
|
for (c = 0; c < children; c++) {
|
|
|
|
uint64_t is_log = B_FALSE, is_hole = B_FALSE;
|
2020-12-05 01:04:39 +03:00
|
|
|
boolean_t is_special = B_FALSE, is_dedup = B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
char *type;
|
|
|
|
nvlist_t **mchild, *vdev;
|
|
|
|
uint_t mchildren;
|
|
|
|
int entry;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unlike cache & spares, slogs are stored in the
|
|
|
|
* ZPOOL_CONFIG_CHILDREN array. We filter them out here.
|
|
|
|
*/
|
|
|
|
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
|
|
|
|
&is_log);
|
|
|
|
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
|
|
|
|
&is_hole);
|
|
|
|
if (is_log || is_hole) {
|
|
|
|
/*
|
|
|
|
* Create a hole vdev and put it in the config.
|
|
|
|
*/
|
|
|
|
if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
|
|
|
|
goto out;
|
|
|
|
if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
|
|
|
|
VDEV_TYPE_HOLE) != 0)
|
|
|
|
goto out;
|
|
|
|
if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
|
|
|
|
1) != 0)
|
|
|
|
goto out;
|
|
|
|
if (lastlog == 0)
|
|
|
|
lastlog = vcount;
|
|
|
|
varray[vcount++] = vdev;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
lastlog = 0;
|
|
|
|
verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
|
|
|
|
== 0);
|
2020-05-06 20:32:28 +03:00
|
|
|
|
|
|
|
if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
|
|
|
|
vdev = child[c];
|
|
|
|
if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
|
|
|
|
goto out;
|
|
|
|
continue;
|
|
|
|
} else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"Source pool must be composed only of mirrors\n"));
|
|
|
|
retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-12-05 01:04:39 +03:00
|
|
|
if (nvlist_lookup_string(child[c],
|
|
|
|
ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) {
|
|
|
|
if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
|
|
|
|
is_special = B_TRUE;
|
|
|
|
else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
|
|
|
|
is_dedup = B_TRUE;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
verify(nvlist_lookup_nvlist_array(child[c],
|
|
|
|
ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
|
|
|
|
|
|
|
|
/* find or add an entry for this top-level vdev */
|
|
|
|
if (newchildren > 0 &&
|
|
|
|
(entry = find_vdev_entry(zhp, mchild, mchildren,
|
|
|
|
newchild, newchildren)) >= 0) {
|
|
|
|
/* We found a disk that the user specified. */
|
|
|
|
vdev = mchild[entry];
|
|
|
|
++found;
|
|
|
|
} else {
|
|
|
|
/* User didn't specify a disk for this vdev. */
|
|
|
|
vdev = mchild[mchildren - 1];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
|
|
|
|
goto out;
|
2020-12-05 01:04:39 +03:00
|
|
|
|
|
|
|
if (flags.dryrun != 0) {
|
|
|
|
if (is_dedup == B_TRUE) {
|
|
|
|
if (nvlist_add_string(varray[vcount - 1],
|
|
|
|
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
|
|
|
VDEV_ALLOC_BIAS_DEDUP) != 0)
|
|
|
|
goto out;
|
|
|
|
} else if (is_special == B_TRUE) {
|
|
|
|
if (nvlist_add_string(varray[vcount - 1],
|
|
|
|
ZPOOL_CONFIG_ALLOCATION_BIAS,
|
|
|
|
VDEV_ALLOC_BIAS_SPECIAL) != 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* did we find every disk the user specified? */
|
|
|
|
if (found != newchildren) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
|
|
|
|
"include at most one disk from each mirror"));
|
|
|
|
retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Prepare the nvlist for populating. */
|
|
|
|
if (*newroot == NULL) {
|
|
|
|
if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
|
|
|
|
goto out;
|
|
|
|
freelist = B_TRUE;
|
|
|
|
if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
|
|
|
|
VDEV_TYPE_ROOT) != 0)
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Add all the children we found */
|
|
|
|
if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
|
|
|
|
lastlog == 0 ? vcount : lastlog) != 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're just doing a dry run, exit now with success.
|
|
|
|
*/
|
|
|
|
if (flags.dryrun) {
|
|
|
|
memory_err = B_FALSE;
|
|
|
|
freelist = B_FALSE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* now build up the config list & call the ioctl */
|
|
|
|
if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (nvlist_add_nvlist(newconfig,
|
|
|
|
ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
|
|
|
|
nvlist_add_string(newconfig,
|
|
|
|
ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
|
|
|
|
nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The new pool is automatically part of the namespace unless we
|
|
|
|
* explicitly export it.
|
|
|
|
*/
|
|
|
|
if (!flags.import)
|
|
|
|
zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
(void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
|
|
|
|
if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
|
|
|
|
goto out;
|
|
|
|
if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
|
|
|
|
retval = zpool_standard_error(hdl, errno, msg);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
freelist = B_FALSE;
|
|
|
|
memory_err = B_FALSE;
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (varray != NULL) {
|
|
|
|
int v;
|
|
|
|
|
|
|
|
for (v = 0; v < vcount; v++)
|
|
|
|
nvlist_free(varray[v]);
|
|
|
|
free(varray);
|
|
|
|
}
|
|
|
|
zcmd_free_nvlists(&zc);
|
2016-04-01 06:54:07 +03:00
|
|
|
nvlist_free(zc_props);
|
|
|
|
nvlist_free(newconfig);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (freelist) {
|
|
|
|
nvlist_free(*newroot);
|
|
|
|
*newroot = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (retval != 0)
|
|
|
|
return (retval);
|
|
|
|
|
|
|
|
if (memory_err)
|
|
|
|
return (no_memory(hdl));
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
* Remove the given device.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
nvlist_t *tgt;
|
2010-05-29 00:45:14 +04:00
|
|
|
boolean_t avail_spare, l2cache, islog;
|
2008-11-20 23:01:55 +03:00
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t version;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
|
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
if (zpool_is_draid_spare(path)) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"dRAID spares cannot be removed"));
|
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
2008-12-03 23:09:06 +03:00
|
|
|
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
&islog)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
|
|
|
|
if (islog && version < SPA_VERSION_HOLES) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
"pool must be upgraded to support log removal"));
|
2010-05-29 00:45:14 +04:00
|
|
|
return (zfs_error(hdl, EZFS_BADVERSION, msg));
|
|
|
|
}
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID);
|
|
|
|
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
switch (errno) {
|
|
|
|
|
|
|
|
case EINVAL:
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"invalid config; all top-level vdevs must "
|
|
|
|
"have the same sector size and not be raidz."));
|
|
|
|
(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case EBUSY:
|
2018-06-08 04:07:29 +03:00
|
|
|
if (islog) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"Mount encrypted datasets to replay logs."));
|
|
|
|
} else {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"Pool busy; removal may already be in progress"));
|
|
|
|
}
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
(void) zfs_error(hdl, EZFS_BUSY, msg);
|
|
|
|
break;
|
|
|
|
|
2018-06-08 04:07:29 +03:00
|
|
|
case EACCES:
|
|
|
|
if (islog) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"Mount encrypted datasets to replay logs."));
|
|
|
|
(void) zfs_error(hdl, EZFS_BUSY, msg);
|
|
|
|
} else {
|
|
|
|
(void) zpool_standard_error(hdl, errno, msg);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
default:
|
|
|
|
(void) zpool_standard_error(hdl, errno, msg);
|
|
|
|
}
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
zpool_vdev_remove_cancel(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
zfs_cmd_t zc;
|
|
|
|
char msg[1024];
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot cancel removal"));
|
|
|
|
|
|
|
|
bzero(&zc, sizeof (zc));
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
zc.zc_cookie = 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
int
|
|
|
|
zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path,
|
|
|
|
uint64_t *sizep)
|
|
|
|
{
|
|
|
|
char msg[1024];
|
|
|
|
nvlist_t *tgt;
|
|
|
|
boolean_t avail_spare, l2cache, islog;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"),
|
|
|
|
path);
|
|
|
|
|
|
|
|
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
|
|
|
|
&islog)) == NULL)
|
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
|
|
|
|
if (avail_spare || l2cache || islog) {
|
|
|
|
*sizep = 0;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) {
|
|
|
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
|
|
|
"indirect size not available"));
|
|
|
|
return (zfs_error(hdl, EINVAL, msg));
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Clear the errors for the pool, or the particular device if specified.
|
|
|
|
*/
|
|
|
|
int
|
2010-05-29 00:45:14 +04:00
|
|
|
zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
nvlist_t *tgt;
|
2017-02-11 01:51:09 +03:00
|
|
|
zpool_load_policy_t policy;
|
2008-11-20 23:01:55 +03:00
|
|
|
boolean_t avail_spare, l2cache;
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
2010-05-29 00:45:14 +04:00
|
|
|
nvlist_t *nvi = NULL;
|
2010-08-27 01:24:34 +04:00
|
|
|
int error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (path)
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
|
|
|
|
path);
|
|
|
|
else
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
|
|
|
|
zhp->zpool_name);
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
if (path) {
|
|
|
|
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refectoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
&l2cache, NULL)) == NULL)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zfs_error(hdl, EZFS_NODEVICE, msg));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't allow error clearing for hot spares. Do allow
|
|
|
|
* error clearing for l2cache devices.
|
|
|
|
*/
|
|
|
|
if (avail_spare)
|
|
|
|
return (zfs_error(hdl, EZFS_ISSPARE, msg));
|
|
|
|
|
|
|
|
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
|
|
|
|
&zc.zc_guid) == 0);
|
|
|
|
}
|
|
|
|
|
2017-02-11 01:51:09 +03:00
|
|
|
zpool_get_load_policy(rewindnvl, &policy);
|
|
|
|
zc.zc_cookie = policy.zlp_rewind;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
return (-1);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
return (-1);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
|
|
|
|
errno == ENOMEM) {
|
|
|
|
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-11 01:51:09 +03:00
|
|
|
if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) &&
|
2010-05-29 00:45:14 +04:00
|
|
|
errno != EPERM && errno != EACCES)) {
|
2017-02-11 01:51:09 +03:00
|
|
|
if (policy.zlp_rewind &
|
2010-05-29 00:45:14 +04:00
|
|
|
(ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
|
|
|
|
(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
|
|
|
|
zpool_rewind_exclaim(hdl, zc.zc_name,
|
2017-02-11 01:51:09 +03:00
|
|
|
((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0),
|
2010-05-29 00:45:14 +04:00
|
|
|
nvi);
|
|
|
|
nvlist_free(nvi);
|
|
|
|
}
|
|
|
|
zcmd_free_nvlists(&zc);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zcmd_free_nvlists(&zc);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Similar to zpool_clear(), but takes a GUID (used by fmd).
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
char msg[1024];
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
|
2013-11-01 23:26:11 +04:00
|
|
|
(u_longlong_t)guid);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
zc.zc_guid = guid;
|
2010-05-29 00:45:14 +04:00
|
|
|
zc.zc_cookie = ZPOOL_NO_REWIND;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2019-10-24 03:29:43 +03:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
|
2011-11-12 02:07:54 +04:00
|
|
|
/*
|
|
|
|
* Change the GUID for a pool.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_reguid(zpool_handle_t *zhp)
|
|
|
|
{
|
|
|
|
char msg[1024];
|
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2011-11-12 02:07:54 +04:00
|
|
|
|
|
|
|
(void) snprintf(msg, sizeof (msg),
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
return (zpool_standard_error(hdl, errno, msg));
|
|
|
|
}
|
|
|
|
|
2012-01-24 06:43:32 +04:00
|
|
|
/*
|
|
|
|
* Reopen the pool.
|
|
|
|
*/
|
|
|
|
int
|
2017-10-26 22:26:09 +03:00
|
|
|
zpool_reopen_one(zpool_handle_t *zhp, void *data)
|
2012-01-24 06:43:32 +04:00
|
|
|
{
|
2017-10-26 22:26:09 +03:00
|
|
|
libzfs_handle_t *hdl = zpool_get_handle(zhp);
|
|
|
|
const char *pool_name = zpool_get_name(zhp);
|
|
|
|
boolean_t *scrub_restart = data;
|
|
|
|
int error;
|
2012-01-24 06:43:32 +04:00
|
|
|
|
2017-10-26 22:26:09 +03:00
|
|
|
error = lzc_reopen(pool_name, *scrub_restart);
|
|
|
|
if (error) {
|
|
|
|
return (zpool_standard_error_fmt(hdl, error,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), pool_name));
|
|
|
|
}
|
2012-01-24 06:43:32 +04:00
|
|
|
|
2017-10-26 22:26:09 +03:00
|
|
|
return (0);
|
2012-01-24 06:43:32 +04:00
|
|
|
}
|
|
|
|
|
2017-05-19 22:33:11 +03:00
|
|
|
/* call into libzfs_core to execute the sync IOCTL per pool */
|
|
|
|
int
|
|
|
|
zpool_sync_one(zpool_handle_t *zhp, void *data)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
libzfs_handle_t *hdl = zpool_get_handle(zhp);
|
|
|
|
const char *pool_name = zpool_get_name(zhp);
|
|
|
|
boolean_t *force = data;
|
|
|
|
nvlist_t *innvl = fnvlist_alloc();
|
|
|
|
|
|
|
|
fnvlist_add_boolean_value(innvl, "force", *force);
|
|
|
|
if ((ret = lzc_sync(pool_name, innvl, NULL)) != 0) {
|
|
|
|
nvlist_free(innvl);
|
|
|
|
return (zpool_standard_error_fmt(hdl, ret,
|
|
|
|
dgettext(TEXT_DOMAIN, "sync '%s' failed"), pool_name));
|
|
|
|
}
|
|
|
|
nvlist_free(innvl);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2010-09-23 05:53:59 +04:00
|
|
|
#define PATH_BUF_LEN 64
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Given a vdev, return the name to display in iostat. If the vdev has a path,
|
|
|
|
* we use that, stripping off any leading "/dev/dsk/"; if not, we use the type.
|
|
|
|
* We also check if this is a whole disk, in which case we strip off the
|
|
|
|
* trailing 's0' slice name.
|
|
|
|
*
|
|
|
|
* This routine is also responsible for identifying when disks have been
|
|
|
|
* reconfigured in a new location. The kernel will have opened the device by
|
|
|
|
* devid, but the path will still refer to the old location. To catch this, we
|
|
|
|
* first do a path -> devid translation (which is fast for the common case). If
|
|
|
|
* the devid matches, we're done. If not, we do a reverse devid -> path
|
|
|
|
* translation and issue the appropriate ioctl() to update the path of the vdev.
|
|
|
|
* If 'zhp' is NULL, then this is an exported pool, and we don't need to do any
|
|
|
|
* of these checks.
|
|
|
|
*/
|
|
|
|
char *
|
2010-05-29 00:45:14 +04:00
|
|
|
zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
|
2013-12-29 22:40:46 +04:00
|
|
|
int name_flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2016-03-14 19:04:21 +03:00
|
|
|
char *path, *type, *env;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t value;
|
2010-09-23 05:53:59 +04:00
|
|
|
char buf[PATH_BUF_LEN];
|
2012-09-05 20:46:29 +04:00
|
|
|
char tmpbuf[PATH_BUF_LEN];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-11-06 00:09:56 +03:00
|
|
|
/*
|
|
|
|
* vdev_name will be "root"/"root-0" for the root vdev, but it is the
|
|
|
|
* zpool name that will be displayed to the user.
|
|
|
|
*/
|
|
|
|
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
|
|
|
|
if (zhp != NULL && strcmp(type, "root") == 0)
|
|
|
|
return (zfs_strdup(hdl, zpool_get_name(zhp)));
|
|
|
|
|
2013-12-29 22:40:46 +04:00
|
|
|
env = getenv("ZPOOL_VDEV_NAME_PATH");
|
|
|
|
if (env && (strtoul(env, NULL, 0) > 0 ||
|
|
|
|
!strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
|
|
|
|
name_flags |= VDEV_NAME_PATH;
|
|
|
|
|
|
|
|
env = getenv("ZPOOL_VDEV_NAME_GUID");
|
|
|
|
if (env && (strtoul(env, NULL, 0) > 0 ||
|
|
|
|
!strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
|
|
|
|
name_flags |= VDEV_NAME_GUID;
|
|
|
|
|
|
|
|
env = getenv("ZPOOL_VDEV_NAME_FOLLOW_LINKS");
|
|
|
|
if (env && (strtoul(env, NULL, 0) > 0 ||
|
|
|
|
!strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2)))
|
|
|
|
name_flags |= VDEV_NAME_FOLLOW_LINKS;
|
|
|
|
|
|
|
|
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 ||
|
|
|
|
name_flags & VDEV_NAME_GUID) {
|
2016-10-02 21:24:54 +03:00
|
|
|
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value);
|
2013-12-29 22:40:46 +04:00
|
|
|
(void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value);
|
2008-11-20 23:01:55 +03:00
|
|
|
path = buf;
|
|
|
|
} else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
|
2013-12-29 22:40:46 +04:00
|
|
|
if (name_flags & VDEV_NAME_FOLLOW_LINKS) {
|
|
|
|
char *rp = realpath(path, NULL);
|
|
|
|
if (rp) {
|
|
|
|
strlcpy(buf, rp, sizeof (buf));
|
|
|
|
path = buf;
|
|
|
|
free(rp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:56:53 +04:00
|
|
|
/*
|
|
|
|
* For a block device only use the name.
|
|
|
|
*/
|
2013-12-29 22:40:46 +04:00
|
|
|
if ((strcmp(type, VDEV_TYPE_DISK) == 0) &&
|
|
|
|
!(name_flags & VDEV_NAME_PATH)) {
|
2019-11-11 23:15:44 +03:00
|
|
|
path = zfs_strip_path(path);
|
2010-08-26 22:56:53 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:56:53 +04:00
|
|
|
/*
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
* Remove the partition from the path if this is a whole disk.
|
2010-08-26 22:56:53 +04:00
|
|
|
*/
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 &&
|
|
|
|
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
|
2013-12-29 22:40:46 +04:00
|
|
|
== 0 && value && !(name_flags & VDEV_NAME_PATH)) {
|
2016-10-19 22:55:59 +03:00
|
|
|
return (zfs_strip_partition(path));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
} else {
|
2017-11-06 00:09:56 +03:00
|
|
|
path = type;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If it's a raidz device, we need to stick in the parity level.
|
|
|
|
*/
|
|
|
|
if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
|
|
|
|
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
|
|
|
|
&value) == 0);
|
2012-09-05 20:46:29 +04:00
|
|
|
(void) snprintf(buf, sizeof (buf), "%s%llu", path,
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)value);
|
2012-09-05 20:46:29 +04:00
|
|
|
path = buf;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-14 00:51:51 +03:00
|
|
|
/*
|
|
|
|
* If it's a dRAID device, we add parity, groups, and spares.
|
|
|
|
*/
|
|
|
|
if (strcmp(path, VDEV_TYPE_DRAID) == 0) {
|
|
|
|
uint64_t ndata, nparity, nspares;
|
|
|
|
nvlist_t **child;
|
|
|
|
uint_t children;
|
|
|
|
|
|
|
|
verify(nvlist_lookup_nvlist_array(nv,
|
|
|
|
ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
|
|
|
|
verify(nvlist_lookup_uint64(nv,
|
|
|
|
ZPOOL_CONFIG_NPARITY, &nparity) == 0);
|
|
|
|
verify(nvlist_lookup_uint64(nv,
|
|
|
|
ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0);
|
|
|
|
verify(nvlist_lookup_uint64(nv,
|
|
|
|
ZPOOL_CONFIG_DRAID_NSPARES, &nspares) == 0);
|
|
|
|
|
|
|
|
path = zpool_draid_name(buf, sizeof (buf), ndata,
|
|
|
|
nparity, nspares, children);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* We identify each top-level vdev by using a <type-id>
|
|
|
|
* naming convention.
|
|
|
|
*/
|
2013-12-29 22:40:46 +04:00
|
|
|
if (name_flags & VDEV_NAME_TYPE_ID) {
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t id;
|
|
|
|
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
|
|
|
|
&id) == 0);
|
2012-09-05 20:46:29 +04:00
|
|
|
(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu",
|
|
|
|
path, (u_longlong_t)id);
|
|
|
|
path = tmpbuf;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return (zfs_strdup(hdl, path));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2015-12-22 04:31:57 +03:00
|
|
|
zbookmark_mem_compare(const void *a, const void *b)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2014-06-25 22:37:59 +04:00
|
|
|
return (memcmp(a, b, sizeof (zbookmark_phys_t)));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Retrieve the persistent error log, uniquify the members, and return to the
|
|
|
|
* caller.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2017-03-23 04:08:55 +03:00
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t count;
|
2014-06-25 22:37:59 +04:00
|
|
|
zbookmark_phys_t *zb = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
int i;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Retrieve the raw error list from the kernel. If the number of errors
|
|
|
|
* has increased, allocate more space and continue until we get the
|
|
|
|
* entire list.
|
|
|
|
*/
|
|
|
|
verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
|
|
|
|
&count) == 0);
|
|
|
|
if (count == 0)
|
|
|
|
return (0);
|
2017-03-23 04:08:55 +03:00
|
|
|
zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
|
|
|
|
count * sizeof (zbookmark_phys_t));
|
2008-11-20 23:01:55 +03:00
|
|
|
zc.zc_nvlist_dst_size = count;
|
|
|
|
(void) strcpy(zc.zc_name, zhp->zpool_name);
|
|
|
|
for (;;) {
|
2019-10-24 03:29:43 +03:00
|
|
|
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
|
2008-11-20 23:01:55 +03:00
|
|
|
&zc) != 0) {
|
|
|
|
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
|
|
|
if (errno == ENOMEM) {
|
2014-06-25 22:37:59 +04:00
|
|
|
void *dst;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
count = zc.zc_nvlist_dst_size;
|
2014-06-25 22:37:59 +04:00
|
|
|
dst = zfs_alloc(zhp->zpool_hdl, count *
|
|
|
|
sizeof (zbookmark_phys_t));
|
|
|
|
zc.zc_nvlist_dst = (uintptr_t)dst;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2017-03-23 04:08:55 +03:00
|
|
|
return (zpool_standard_error_fmt(hdl, errno,
|
|
|
|
dgettext(TEXT_DOMAIN, "errors: List of "
|
|
|
|
"errors unavailable")));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sort the resulting bookmarks. This is a little confusing due to the
|
|
|
|
* implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last
|
2019-09-03 03:53:27 +03:00
|
|
|
* to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks
|
2008-11-20 23:01:55 +03:00
|
|
|
* _not_ copied as part of the process. So we point the start of our
|
|
|
|
* array appropriate and decrement the total number of elements.
|
|
|
|
*/
|
2014-06-25 22:37:59 +04:00
|
|
|
zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
|
2008-11-20 23:01:55 +03:00
|
|
|
zc.zc_nvlist_dst_size;
|
|
|
|
count -= zc.zc_nvlist_dst_size;
|
|
|
|
|
2015-12-22 04:31:57 +03:00
|
|
|
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < count; i++) {
|
|
|
|
nvlist_t *nv;
|
|
|
|
|
|
|
|
/* ignoring zb_blkid and zb_level for now */
|
|
|
|
if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset &&
|
|
|
|
zb[i-1].zb_object == zb[i].zb_object)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0)
|
|
|
|
goto nomem;
|
|
|
|
if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET,
|
|
|
|
zb[i].zb_objset) != 0) {
|
|
|
|
nvlist_free(nv);
|
|
|
|
goto nomem;
|
|
|
|
}
|
|
|
|
if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT,
|
|
|
|
zb[i].zb_object) != 0) {
|
|
|
|
nvlist_free(nv);
|
|
|
|
goto nomem;
|
|
|
|
}
|
|
|
|
if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) {
|
|
|
|
nvlist_free(nv);
|
|
|
|
goto nomem;
|
|
|
|
}
|
|
|
|
nvlist_free(nv);
|
|
|
|
}
|
|
|
|
|
|
|
|
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
nomem:
|
|
|
|
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
|
|
|
return (no_memory(zhp->zpool_hdl));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Upgrade a ZFS pool to the latest on-disk version.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) strcpy(zc.zc_name, zhp->zpool_name);
|
|
|
|
zc.zc_cookie = new_version;
|
|
|
|
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0)
|
|
|
|
return (zpool_standard_error_fmt(hdl, errno,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"),
|
|
|
|
zhp->zpool_name));
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Rebuild a command line in 'string', a buffer of 'len' bytes: the basename
 * of argv[0] followed by each remaining argument, separated by single
 * spaces.  Copies are bounded by 'len' through strlcpy()/strlcat().
 */
void
zfs_save_arguments(int argc, char **argv, char *string, int len)
{
	int idx = 1;

	(void) strlcpy(string, basename(argv[0]), len);

	while (idx < argc) {
		(void) strlcat(string, " ", len);
		(void) strlcat(string, argv[idx], len);
		idx++;
	}
}
|
|
|
|
|
|
|
|
int
|
2013-08-28 15:45:09 +04:00
|
|
|
zpool_log_history(libzfs_handle_t *hdl, const char *message)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2013-08-28 15:45:09 +04:00
|
|
|
nvlist_t *args;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
args = fnvlist_alloc();
|
|
|
|
fnvlist_add_string(args, "message", message);
|
|
|
|
err = zcmd_write_src_nvlist(hdl, &zc, args);
|
|
|
|
if (err == 0)
|
2019-10-24 03:29:43 +03:00
|
|
|
err = zfs_ioctl(hdl, ZFS_IOC_LOG_HISTORY, &zc);
|
2013-08-28 15:45:09 +04:00
|
|
|
nvlist_free(args);
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
return (err);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform ioctl to get some command history of a pool.
|
|
|
|
*
|
|
|
|
* 'buf' is the buffer to fill up to 'len' bytes. 'off' is the
|
|
|
|
* logical offset of the history buffer to start reading from.
|
|
|
|
*
|
|
|
|
* Upon return, 'off' is the next logical offset to read from and
|
|
|
|
* 'len' is the actual amount of bytes read into 'buf'.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
|
|
|
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
|
|
|
|
zc.zc_history = (uint64_t)(uintptr_t)buf;
|
|
|
|
zc.zc_history_len = *len;
|
|
|
|
zc.zc_history_offset = *off;
|
|
|
|
|
2019-10-24 03:29:43 +03:00
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
switch (errno) {
|
|
|
|
case EPERM:
|
|
|
|
return (zfs_error_fmt(hdl, EZFS_PERM,
|
|
|
|
dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot show history for pool '%s'"),
|
|
|
|
zhp->zpool_name));
|
|
|
|
case ENOENT:
|
|
|
|
return (zfs_error_fmt(hdl, EZFS_NOHISTORY,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get history for pool "
|
|
|
|
"'%s'"), zhp->zpool_name));
|
|
|
|
case ENOTSUP:
|
|
|
|
return (zfs_error_fmt(hdl, EZFS_BADVERSION,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get history for pool "
|
|
|
|
"'%s', pool must be upgraded"), zhp->zpool_name));
|
|
|
|
default:
|
|
|
|
return (zpool_standard_error_fmt(hdl, errno,
|
|
|
|
dgettext(TEXT_DOMAIN,
|
|
|
|
"cannot get history for '%s'"), zhp->zpool_name));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*len = zc.zc_history_len;
|
|
|
|
*off = zc.zc_history_offset;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Retrieve the command history of a pool.
|
|
|
|
*/
|
|
|
|
int
|
2019-10-28 19:49:44 +03:00
|
|
|
zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off,
|
|
|
|
boolean_t *eof)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2014-09-17 19:41:51 +04:00
|
|
|
char *buf;
|
|
|
|
int buflen = 128 * 1024;
|
2008-11-20 23:01:55 +03:00
|
|
|
nvlist_t **records = NULL;
|
|
|
|
uint_t numrecords = 0;
|
|
|
|
int err, i;
|
2019-10-28 19:49:44 +03:00
|
|
|
uint64_t start = *off;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-09-17 19:41:51 +04:00
|
|
|
buf = malloc(buflen);
|
|
|
|
if (buf == NULL)
|
|
|
|
return (ENOMEM);
|
2019-10-28 19:49:44 +03:00
|
|
|
/* process about 1MB a time */
|
|
|
|
while (*off - start < 1024 * 1024) {
|
2014-09-17 19:41:51 +04:00
|
|
|
uint64_t bytes_read = buflen;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t leftover;
|
|
|
|
|
2019-10-28 19:49:44 +03:00
|
|
|
if ((err = get_history(zhp, buf, off, &bytes_read)) != 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* if nothing else was read in, we're at EOF, just return */
|
2019-10-28 19:49:44 +03:00
|
|
|
if (!bytes_read) {
|
|
|
|
*eof = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
2019-10-28 19:49:44 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if ((err = zpool_history_unpack(buf, bytes_read,
|
|
|
|
&leftover, &records, &numrecords)) != 0)
|
|
|
|
break;
|
2019-10-28 19:49:44 +03:00
|
|
|
*off -= leftover;
|
2014-09-17 19:41:51 +04:00
|
|
|
if (leftover == bytes_read) {
|
|
|
|
/*
|
|
|
|
* no progress made, because buffer is not big enough
|
|
|
|
* to hold this record; resize and retry.
|
|
|
|
*/
|
|
|
|
buflen *= 2;
|
|
|
|
free(buf);
|
|
|
|
buf = malloc(buflen);
|
|
|
|
if (buf == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
2019-10-28 19:49:44 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-09-17 19:41:51 +04:00
|
|
|
free(buf);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (!err) {
|
|
|
|
verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
|
|
|
|
verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
|
|
|
|
records, numrecords) == 0);
|
|
|
|
}
|
|
|
|
for (i = 0; i < numrecords; i++)
|
|
|
|
nvlist_free(records[i]);
|
|
|
|
free(records);
|
|
|
|
|
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
2010-08-26 22:42:43 +04:00
|
|
|
/*
|
2013-11-23 04:00:39 +04:00
|
|
|
* Retrieve the next event given the passed 'zevent_fd' file descriptor.
|
|
|
|
* If there is a new event available 'nvp' will contain a newly allocated
|
|
|
|
* nvlist and 'dropped' will be set to the number of missed events since
|
|
|
|
* the last call to this function. When 'nvp' is set to NULL it indicates
|
|
|
|
* no new events are available. In either case the function returns 0 and
|
|
|
|
* it is up to the caller to free 'nvp'. In the case of a fatal error the
|
|
|
|
* function will return a non-zero value. When the function is called in
|
2014-02-12 22:30:18 +04:00
|
|
|
* blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed),
|
|
|
|
* it will not return until a new event is available.
|
2010-08-26 22:42:43 +04:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp,
|
2014-02-12 22:30:18 +04:00
|
|
|
int *dropped, unsigned flags, int zevent_fd)
|
2010-08-26 22:42:43 +04:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2010-08-26 22:42:43 +04:00
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
*nvp = NULL;
|
|
|
|
*dropped = 0;
|
2013-11-23 04:00:39 +04:00
|
|
|
zc.zc_cleanup_fd = zevent_fd;
|
2010-08-26 22:42:43 +04:00
|
|
|
|
2014-02-12 22:30:18 +04:00
|
|
|
if (flags & ZEVENT_NONBLOCK)
|
2010-08-26 22:42:43 +04:00
|
|
|
zc.zc_guid = ZEVENT_NONBLOCK;
|
|
|
|
|
|
|
|
if (zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE) != 0)
|
|
|
|
return (-1);
|
|
|
|
|
|
|
|
retry:
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) {
|
|
|
|
switch (errno) {
|
|
|
|
case ESHUTDOWN:
|
|
|
|
error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
|
|
|
|
dgettext(TEXT_DOMAIN, "zfs shutdown"));
|
|
|
|
goto out;
|
|
|
|
case ENOENT:
|
|
|
|
/* Blocking error case should not occur */
|
2014-02-12 22:30:18 +04:00
|
|
|
if (!(flags & ZEVENT_NONBLOCK))
|
2010-08-26 22:42:43 +04:00
|
|
|
error = zpool_standard_error_fmt(hdl, errno,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get event"));
|
|
|
|
|
|
|
|
goto out;
|
|
|
|
case ENOMEM:
|
|
|
|
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
|
|
|
|
error = zfs_error_fmt(hdl, EZFS_NOMEM,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get event"));
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
error = zpool_standard_error_fmt(hdl, errno,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get event"));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
error = zcmd_read_dst_nvlist(hdl, &zc, nvp);
|
|
|
|
if (error != 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
*dropped = (int)zc.zc_cookie;
|
|
|
|
out:
|
|
|
|
zcmd_free_nvlists(&zc);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear all events.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_events_clear(libzfs_handle_t *hdl, int *count)
|
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2010-08-26 22:42:43 +04:00
|
|
|
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0)
|
2021-05-15 13:23:45 +03:00
|
|
|
return (zpool_standard_error(hdl, errno,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot clear events")));
|
2010-08-26 22:42:43 +04:00
|
|
|
|
|
|
|
if (count != NULL)
|
|
|
|
*count = (int)zc.zc_cookie; /* # of events cleared */
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2013-11-23 02:52:16 +04:00
|
|
|
/*
|
|
|
|
* Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for
|
|
|
|
* the passed zevent_fd file handle. On success zero is returned,
|
|
|
|
* otherwise -1 is returned and hdl->libzfs_error is set to the errno.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd)
|
|
|
|
{
|
|
|
|
zfs_cmd_t zc = {"\0"};
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
zc.zc_guid = eid;
|
|
|
|
zc.zc_cleanup_fd = zevent_fd;
|
|
|
|
|
|
|
|
if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) {
|
|
|
|
switch (errno) {
|
|
|
|
case ENOENT:
|
|
|
|
error = zfs_error_fmt(hdl, EZFS_NOENT,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get event"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ENOMEM:
|
|
|
|
error = zfs_error_fmt(hdl, EZFS_NOMEM,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get event"));
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
error = zpool_standard_error_fmt(hdl, errno,
|
|
|
|
dgettext(TEXT_DOMAIN, "cannot get event"));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2020-05-20 20:05:33 +03:00
|
|
|
static void
|
|
|
|
zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
|
|
|
|
char *pathname, size_t len, boolean_t always_unmounted)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-09-04 16:00:57 +04:00
|
|
|
zfs_cmd_t zc = {"\0"};
|
2008-11-20 23:01:55 +03:00
|
|
|
boolean_t mounted = B_FALSE;
|
|
|
|
char *mntpnt = NULL;
|
2016-06-16 00:28:36 +03:00
|
|
|
char dsname[ZFS_MAX_DATASET_NAME_LEN];
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (dsobj == 0) {
|
|
|
|
/* special case for the MOS */
|
2013-11-01 23:26:11 +04:00
|
|
|
(void) snprintf(pathname, len, "<metadata>:<0x%llx>",
|
|
|
|
(longlong_t)obj);
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get the dataset's name */
|
|
|
|
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
|
|
|
|
zc.zc_obj = dsobj;
|
2019-10-24 03:29:43 +03:00
|
|
|
if (zfs_ioctl(zhp->zpool_hdl,
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
|
|
|
|
/* just write out a path of two object numbers */
|
|
|
|
(void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
|
2010-08-26 20:52:39 +04:00
|
|
|
(longlong_t)dsobj, (longlong_t)obj);
|
2008-11-20 23:01:55 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
(void) strlcpy(dsname, zc.zc_value, sizeof (dsname));
|
|
|
|
|
|
|
|
/* find out if the dataset is mounted */
|
2020-05-20 20:05:33 +03:00
|
|
|
mounted = !always_unmounted && is_mounted(zhp->zpool_hdl, dsname,
|
|
|
|
&mntpnt);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/* get the corrupted object's path */
|
|
|
|
(void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
|
|
|
|
zc.zc_obj = obj;
|
2019-10-24 03:29:43 +03:00
|
|
|
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_OBJ_TO_PATH,
|
2008-11-20 23:01:55 +03:00
|
|
|
&zc) == 0) {
|
|
|
|
if (mounted) {
|
|
|
|
(void) snprintf(pathname, len, "%s%s", mntpnt,
|
|
|
|
zc.zc_value);
|
|
|
|
} else {
|
|
|
|
(void) snprintf(pathname, len, "%s:%s",
|
|
|
|
dsname, zc.zc_value);
|
|
|
|
}
|
|
|
|
} else {
|
2013-11-01 23:26:11 +04:00
|
|
|
(void) snprintf(pathname, len, "%s:<0x%llx>", dsname,
|
|
|
|
(longlong_t)obj);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
free(mntpnt);
|
|
|
|
}
|
|
|
|
|
2020-05-20 20:05:33 +03:00
|
|
|
void
|
|
|
|
zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
|
|
|
|
char *pathname, size_t len)
|
|
|
|
{
|
|
|
|
zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
|
|
|
|
char *pathname, size_t len)
|
|
|
|
{
|
|
|
|
zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_TRUE);
|
|
|
|
}
|
Add subcommand to wait for background zfs activity to complete
Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.
This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:
- Scrubs or resilvers to complete
- Devices to be initialized
- Devices to be replaced
- Devices to be removed
- Checkpoints to be discarded
- Background freeing to complete
For example, a scrub that is in progress could be waited for by running
zpool wait -t scrub <pool>
This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was used
over other methods of kernel-userspace communication primarily for the
sake of portability.
Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:
- Added ZoL-style ioctl input declaration.
- Reorganized error handling in zpool_initialize in libzfs to integrate
better with changes made for TRIM support.
- Fixed check for whether a checkpoint discard is in progress.
Previously it also waited if the pool had a checkpoint, instead of
just if a checkpoint was being discarded.
- Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
- Updated more existing tests to make use of new 'zpool wait'
functionality, tests that don't exist in Delphix OS.
- Used existing ZoL tunable zfs_scan_suspend_progress, together with
zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
- Added support for a non-integral interval argument to zpool wait.
Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
Closes #9162
2019-09-14 04:09:06 +03:00
|
|
|
/*
|
|
|
|
* Wait while the specified activity is in progress in the pool.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_wait(zpool_handle_t *zhp, zpool_wait_activity_t activity)
|
|
|
|
{
|
|
|
|
boolean_t missing;
|
|
|
|
|
|
|
|
int error = zpool_wait_status(zhp, activity, &missing, NULL);
|
|
|
|
|
|
|
|
if (missing) {
|
|
|
|
(void) zpool_standard_error_fmt(zhp->zpool_hdl, ENOENT,
|
|
|
|
dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"),
|
|
|
|
zhp->zpool_name);
|
|
|
|
return (ENOENT);
|
|
|
|
} else {
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for the given activity and return the status of the wait (whether or not
|
|
|
|
* any waiting was done) in the 'waited' parameter. Non-existent pools are
|
|
|
|
* reported via the 'missing' parameter, rather than by printing an error
|
|
|
|
* message. This is convenient when this function is called in a loop over a
|
|
|
|
* long period of time (as it is, for example, by zpool's wait cmd). In that
|
|
|
|
* scenario, a pool being exported or destroyed should be considered a normal
|
|
|
|
* event, so we don't want to print an error when we find that the pool doesn't
|
|
|
|
* exist.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity,
|
|
|
|
boolean_t *missing, boolean_t *waited)
|
|
|
|
{
|
|
|
|
int error = lzc_wait(zhp->zpool_name, activity, waited);
|
|
|
|
*missing = (error == ENOENT);
|
|
|
|
if (*missing)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (error != 0) {
|
|
|
|
(void) zpool_standard_error_fmt(zhp->zpool_hdl, error,
|
|
|
|
dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"),
|
|
|
|
zhp->zpool_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
2020-05-07 19:36:33 +03:00
|
|
|
|
|
|
|
int
|
2020-09-16 01:42:27 +03:00
|
|
|
zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap)
|
2020-05-07 19:36:33 +03:00
|
|
|
{
|
|
|
|
int error = lzc_set_bootenv(zhp->zpool_name, envmap);
|
|
|
|
if (error != 0) {
|
|
|
|
(void) zpool_standard_error_fmt(zhp->zpool_hdl, error,
|
|
|
|
dgettext(TEXT_DOMAIN,
|
|
|
|
"error setting bootenv in pool '%s'"), zhp->zpool_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2020-09-16 01:42:27 +03:00
|
|
|
zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp)
|
2020-05-07 19:36:33 +03:00
|
|
|
{
|
2020-09-16 01:42:27 +03:00
|
|
|
nvlist_t *nvl;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
nvl = NULL;
|
|
|
|
error = lzc_get_bootenv(zhp->zpool_name, &nvl);
|
2020-05-07 19:36:33 +03:00
|
|
|
if (error != 0) {
|
|
|
|
(void) zpool_standard_error_fmt(zhp->zpool_hdl, error,
|
|
|
|
dgettext(TEXT_DOMAIN,
|
|
|
|
"error getting bootenv in pool '%s'"), zhp->zpool_name);
|
2020-09-16 01:42:27 +03:00
|
|
|
} else {
|
|
|
|
*nvlp = nvl;
|
2020-05-07 19:36:33 +03:00
|
|
|
}
|
|
|
|
|
2020-09-16 01:42:27 +03:00
|
|
|
return (error);
|
2020-05-07 19:36:33 +03:00
|
|
|
}
|
2021-02-18 08:30:45 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Attempt to read and parse feature file(s) (from "compatibility" property).
|
|
|
|
* Files contain zpool feature names, comma or whitespace-separated.
|
|
|
|
* Comments (# character to next newline) are discarded.
|
|
|
|
*
|
|
|
|
* Arguments:
|
|
|
|
* compatibility : string containing feature filenames
|
|
|
|
* features : either NULL or pointer to array of boolean
|
2021-04-12 19:08:56 +03:00
|
|
|
* report : either NULL or pointer to string buffer
|
|
|
|
* rlen : length of "report" buffer
|
2021-02-18 08:30:45 +03:00
|
|
|
*
|
|
|
|
* compatibility is NULL (unset), "", "off", "legacy", or list of
|
|
|
|
* comma-separated filenames. filenames should either be absolute,
|
|
|
|
* or relative to:
|
|
|
|
* 1) ZPOOL_SYSCONF_COMPAT_D (eg: /etc/zfs/compatibility.d) or
|
|
|
|
* 2) ZPOOL_DATA_COMPAT_D (eg: /usr/share/zfs/compatibility.d).
|
|
|
|
* (Unset), "" or "off" => enable all features
|
|
|
|
* "legacy" => disable all features
|
2021-04-12 19:08:56 +03:00
|
|
|
*
|
2021-02-18 08:30:45 +03:00
|
|
|
* Any feature names read from files which match unames in spa_feature_table
|
|
|
|
* will have the corresponding boolean set in the features array (if non-NULL).
|
|
|
|
* If more than one feature set specified, only features present in *all* of
|
|
|
|
* them will be set.
|
|
|
|
*
|
2021-04-12 19:08:56 +03:00
|
|
|
* "report" if not NULL will be populated with a suitable status message.
|
2021-02-18 08:30:45 +03:00
|
|
|
*
|
|
|
|
* Return values:
|
|
|
|
* ZPOOL_COMPATIBILITY_OK : files read and parsed ok
|
|
|
|
* ZPOOL_COMPATIBILITY_BADFILE : file too big or not a text file
|
2021-04-12 19:08:56 +03:00
|
|
|
* ZPOOL_COMPATIBILITY_BADTOKEN : SYSCONF file contains invalid feature name
|
|
|
|
* ZPOOL_COMPATIBILITY_WARNTOKEN : DATA file contains invalid feature name
|
|
|
|
* ZPOOL_COMPATIBILITY_NOFILES : no feature files found
|
2021-02-18 08:30:45 +03:00
|
|
|
*/
|
|
|
|
zpool_compat_status_t
|
2021-04-12 19:08:56 +03:00
|
|
|
zpool_load_compat(const char *compat, boolean_t *features, char *report,
|
|
|
|
size_t rlen)
|
2021-02-18 08:30:45 +03:00
|
|
|
{
|
|
|
|
int sdirfd, ddirfd, featfd;
|
|
|
|
struct stat fs;
|
2021-04-12 19:08:56 +03:00
|
|
|
char *fc;
|
|
|
|
char *ps, *ls, *ws;
|
2021-02-18 08:30:45 +03:00
|
|
|
char *file, *line, *word;
|
2021-04-12 19:08:56 +03:00
|
|
|
|
|
|
|
char l_compat[ZFS_MAXPROPLEN];
|
|
|
|
|
|
|
|
boolean_t ret_nofiles = B_TRUE;
|
|
|
|
boolean_t ret_badfile = B_FALSE;
|
|
|
|
boolean_t ret_badtoken = B_FALSE;
|
|
|
|
boolean_t ret_warntoken = B_FALSE;
|
2021-02-18 08:30:45 +03:00
|
|
|
|
|
|
|
/* special cases (unset), "" and "off" => enable all features */
|
2021-04-12 19:08:56 +03:00
|
|
|
if (compat == NULL || compat[0] == '\0' ||
|
|
|
|
strcmp(compat, ZPOOL_COMPAT_OFF) == 0) {
|
2021-02-18 08:30:45 +03:00
|
|
|
if (features != NULL)
|
2021-04-12 19:08:56 +03:00
|
|
|
for (uint_t i = 0; i < SPA_FEATURES; i++)
|
2021-02-18 08:30:45 +03:00
|
|
|
features[i] = B_TRUE;
|
2021-04-12 19:08:56 +03:00
|
|
|
if (report != NULL)
|
|
|
|
strlcpy(report, gettext("all features enabled"), rlen);
|
2021-02-18 08:30:45 +03:00
|
|
|
return (ZPOOL_COMPATIBILITY_OK);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Final special case "legacy" => disable all features */
|
2021-04-12 19:08:56 +03:00
|
|
|
if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) {
|
2021-02-18 08:30:45 +03:00
|
|
|
if (features != NULL)
|
2021-04-12 19:08:56 +03:00
|
|
|
for (uint_t i = 0; i < SPA_FEATURES; i++)
|
2021-02-18 08:30:45 +03:00
|
|
|
features[i] = B_FALSE;
|
2021-04-12 19:08:56 +03:00
|
|
|
if (report != NULL)
|
|
|
|
strlcpy(report, gettext("all features disabled"), rlen);
|
2021-02-18 08:30:45 +03:00
|
|
|
return (ZPOOL_COMPATIBILITY_OK);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Start with all true; will be ANDed with results from each file
|
|
|
|
*/
|
|
|
|
if (features != NULL)
|
2021-04-12 19:08:56 +03:00
|
|
|
for (uint_t i = 0; i < SPA_FEATURES; i++)
|
2021-02-18 08:30:45 +03:00
|
|
|
features[i] = B_TRUE;
|
|
|
|
|
2022-09-12 22:54:43 +03:00
|
|
|
char err_badfile[ZFS_MAXPROPLEN] = "";
|
|
|
|
char err_badtoken[ZFS_MAXPROPLEN] = "";
|
2021-04-12 19:08:56 +03:00
|
|
|
|
2021-02-18 08:30:45 +03:00
|
|
|
/*
|
|
|
|
* We ignore errors from the directory open()
|
|
|
|
* as they're only needed if the filename is relative
|
|
|
|
* which will be checked during the openat().
|
|
|
|
*/
|
2021-06-03 18:13:42 +03:00
|
|
|
|
|
|
|
/* O_PATH safer than O_RDONLY if system allows it */
|
|
|
|
#if defined(O_PATH)
|
|
|
|
#define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_PATH)
|
|
|
|
#else
|
|
|
|
#define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_RDONLY)
|
2021-02-18 08:30:45 +03:00
|
|
|
#endif
|
2021-06-03 18:13:42 +03:00
|
|
|
|
|
|
|
sdirfd = open(ZPOOL_SYSCONF_COMPAT_D, ZC_DIR_FLAGS);
|
|
|
|
ddirfd = open(ZPOOL_DATA_COMPAT_D, ZC_DIR_FLAGS);
|
2021-02-18 08:30:45 +03:00
|
|
|
|
2021-04-12 19:08:56 +03:00
|
|
|
(void) strlcpy(l_compat, compat, ZFS_MAXPROPLEN);
|
|
|
|
|
|
|
|
for (file = strtok_r(l_compat, ",", &ps);
|
|
|
|
file != NULL;
|
|
|
|
file = strtok_r(NULL, ",", &ps)) {
|
|
|
|
|
2021-06-03 18:13:42 +03:00
|
|
|
boolean_t l_features[SPA_FEATURES];
|
2021-04-12 19:08:56 +03:00
|
|
|
|
|
|
|
enum { Z_SYSCONF, Z_DATA } source;
|
2021-02-18 08:30:45 +03:00
|
|
|
|
|
|
|
/* try sysconfdir first, then datadir */
|
2021-04-12 19:08:56 +03:00
|
|
|
source = Z_SYSCONF;
|
2021-04-29 23:21:41 +03:00
|
|
|
if ((featfd = openat(sdirfd, file, O_RDONLY | O_CLOEXEC)) < 0) {
|
|
|
|
featfd = openat(ddirfd, file, O_RDONLY | O_CLOEXEC);
|
2021-04-12 19:08:56 +03:00
|
|
|
source = Z_DATA;
|
2021-02-18 08:30:45 +03:00
|
|
|
}
|
|
|
|
|
2021-04-12 19:08:56 +03:00
|
|
|
/* File readable and correct size? */
|
|
|
|
if (featfd < 0 ||
|
|
|
|
fstat(featfd, &fs) < 0 ||
|
|
|
|
fs.st_size < 1 ||
|
|
|
|
fs.st_size > ZPOOL_COMPAT_MAXSIZE) {
|
2021-02-18 08:30:45 +03:00
|
|
|
(void) close(featfd);
|
2021-04-12 19:08:56 +03:00
|
|
|
strlcat(err_badfile, file, ZFS_MAXPROPLEN);
|
|
|
|
strlcat(err_badfile, " ", ZFS_MAXPROPLEN);
|
|
|
|
ret_badfile = B_TRUE;
|
|
|
|
continue;
|
2021-02-18 08:30:45 +03:00
|
|
|
}
|
|
|
|
|
2021-06-03 18:13:42 +03:00
|
|
|
/* Prefault the file if system allows */
|
|
|
|
#if defined(MAP_POPULATE)
|
|
|
|
#define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
|
|
|
|
#elif defined(MAP_PREFAULT_READ)
|
|
|
|
#define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_PREFAULT_READ)
|
|
|
|
#else
|
|
|
|
#define ZC_MMAP_FLAGS (MAP_PRIVATE)
|
2021-04-29 23:21:41 +03:00
|
|
|
#endif
|
2021-06-03 18:13:42 +03:00
|
|
|
|
2021-02-18 08:30:45 +03:00
|
|
|
/* private mmap() so we can strtok safely */
|
2021-06-03 18:13:42 +03:00
|
|
|
fc = (char *)mmap(NULL, fs.st_size, PROT_READ | PROT_WRITE,
|
|
|
|
ZC_MMAP_FLAGS, featfd, 0);
|
2021-02-18 08:30:45 +03:00
|
|
|
(void) close(featfd);
|
|
|
|
|
2021-04-12 19:08:56 +03:00
|
|
|
/* map ok, and last character == newline? */
|
2021-04-29 23:21:41 +03:00
|
|
|
if (fc == MAP_FAILED || fc[fs.st_size - 1] != '\n') {
|
2021-02-18 08:30:45 +03:00
|
|
|
(void) munmap((void *) fc, fs.st_size);
|
2021-04-12 19:08:56 +03:00
|
|
|
strlcat(err_badfile, file, ZFS_MAXPROPLEN);
|
|
|
|
strlcat(err_badfile, " ", ZFS_MAXPROPLEN);
|
|
|
|
ret_badfile = B_TRUE;
|
|
|
|
continue;
|
2021-02-18 08:30:45 +03:00
|
|
|
}
|
|
|
|
|
2021-04-12 19:08:56 +03:00
|
|
|
ret_nofiles = B_FALSE;
|
2021-02-18 08:30:45 +03:00
|
|
|
|
2021-06-03 18:13:42 +03:00
|
|
|
for (uint_t i = 0; i < SPA_FEATURES; i++)
|
|
|
|
l_features[i] = B_FALSE;
|
|
|
|
|
|
|
|
/* replace final newline with NULL to ensure string ends */
|
2021-04-12 19:08:56 +03:00
|
|
|
fc[fs.st_size - 1] = '\0';
|
|
|
|
|
|
|
|
for (line = strtok_r(fc, "\n", &ls);
|
|
|
|
line != NULL;
|
|
|
|
line = strtok_r(NULL, "\n", &ls)) {
|
2021-02-18 08:30:45 +03:00
|
|
|
/* discard comments */
|
2021-07-08 06:08:13 +03:00
|
|
|
char *r = strchr(line, '#');
|
|
|
|
if (r != NULL)
|
|
|
|
*r = '\0';
|
2021-02-18 08:30:45 +03:00
|
|
|
|
2021-04-12 19:08:56 +03:00
|
|
|
for (word = strtok_r(line, ", \t", &ws);
|
|
|
|
word != NULL;
|
|
|
|
word = strtok_r(NULL, ", \t", &ws)) {
|
2021-02-18 08:30:45 +03:00
|
|
|
/* Find matching feature name */
|
2021-04-12 19:08:56 +03:00
|
|
|
uint_t f;
|
|
|
|
for (f = 0; f < SPA_FEATURES; f++) {
|
2021-02-18 08:30:45 +03:00
|
|
|
zfeature_info_t *fi =
|
2021-04-12 19:08:56 +03:00
|
|
|
&spa_feature_table[f];
|
2021-02-18 08:30:45 +03:00
|
|
|
if (strcmp(word, fi->fi_uname) == 0) {
|
2021-04-12 19:08:56 +03:00
|
|
|
l_features[f] = B_TRUE;
|
2021-02-18 08:30:45 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-04-12 19:08:56 +03:00
|
|
|
if (f < SPA_FEATURES)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* found an unrecognized word */
|
|
|
|
/* lightly sanitize it */
|
|
|
|
if (strlen(word) > 32)
|
|
|
|
word[32] = '\0';
|
|
|
|
for (char *c = word; *c != '\0'; c++)
|
|
|
|
if (!isprint(*c))
|
|
|
|
*c = '?';
|
|
|
|
|
|
|
|
strlcat(err_badtoken, word, ZFS_MAXPROPLEN);
|
|
|
|
strlcat(err_badtoken, " ", ZFS_MAXPROPLEN);
|
|
|
|
if (source == Z_SYSCONF)
|
|
|
|
ret_badtoken = B_TRUE;
|
|
|
|
else
|
|
|
|
ret_warntoken = B_TRUE;
|
2021-02-18 08:30:45 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
(void) munmap((void *) fc, fs.st_size);
|
2021-04-12 19:08:56 +03:00
|
|
|
|
|
|
|
if (features != NULL)
|
|
|
|
for (uint_t i = 0; i < SPA_FEATURES; i++)
|
|
|
|
features[i] &= l_features[i];
|
2021-02-18 08:30:45 +03:00
|
|
|
}
|
|
|
|
(void) close(sdirfd);
|
|
|
|
(void) close(ddirfd);
|
2021-04-12 19:08:56 +03:00
|
|
|
|
|
|
|
/* Return the most serious error */
|
|
|
|
if (ret_badfile) {
|
|
|
|
if (report != NULL)
|
|
|
|
snprintf(report, rlen, gettext("could not read/"
|
|
|
|
"parse feature file(s): %s"), err_badfile);
|
|
|
|
return (ZPOOL_COMPATIBILITY_BADFILE);
|
|
|
|
}
|
|
|
|
if (ret_nofiles) {
|
|
|
|
if (report != NULL)
|
|
|
|
strlcpy(report,
|
|
|
|
gettext("no valid compatibility files specified"),
|
|
|
|
rlen);
|
2021-02-18 08:30:45 +03:00
|
|
|
return (ZPOOL_COMPATIBILITY_NOFILES);
|
2021-04-12 19:08:56 +03:00
|
|
|
}
|
|
|
|
if (ret_badtoken) {
|
|
|
|
if (report != NULL)
|
|
|
|
snprintf(report, rlen, gettext("invalid feature "
|
|
|
|
"name(s) in local compatibility files: %s"),
|
|
|
|
err_badtoken);
|
|
|
|
return (ZPOOL_COMPATIBILITY_BADTOKEN);
|
|
|
|
}
|
|
|
|
if (ret_warntoken) {
|
|
|
|
if (report != NULL)
|
|
|
|
snprintf(report, rlen, gettext("unrecognized feature "
|
|
|
|
"name(s) in distribution compatibility files: %s"),
|
|
|
|
err_badtoken);
|
|
|
|
return (ZPOOL_COMPATIBILITY_WARNTOKEN);
|
|
|
|
}
|
|
|
|
if (report != NULL)
|
|
|
|
strlcpy(report, gettext("compatibility set ok"), rlen);
|
2021-02-18 08:30:45 +03:00
|
|
|
return (ZPOOL_COMPATIBILITY_OK);
|
|
|
|
}
|