Merge branch 'ashift'

This branch adds some overdue ashift improvements.

  * Add '-o ashift' to 'zpool add' and 'zpool attach'
  * Improve AF hard disk detection
  * Allow 'zpool import' to handle increases in ashift

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
Brian Behlendorf 2012-11-15 11:43:54 -08:00
commit 54602c3771
8 changed files with 144 additions and 23 deletions

View File

@ -24,6 +24,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012 by Frederik Wessels. All rights reserved.
* Copyright (c) 2012 by Cyril Plisko. All rights reserved.
*/
#include <assert.h>
@ -199,10 +200,11 @@ static const char *
get_usage(zpool_help_t idx) {
switch (idx) {
case HELP_ADD:
return (gettext("\tadd [-fn] <pool> <vdev> ...\n"));
return (gettext("\tadd [-fn] [-o property=value] "
"<pool> <vdev> ...\n"));
case HELP_ATTACH:
return (gettext("\tattach [-f] <pool> <device> "
"<new-device>\n"));
return (gettext("\tattach [-f] [-o property=value] "
"<pool> <device> <new-device>\n"));
case HELP_CLEAR:
return (gettext("\tclear [-nF] <pool> [device]\n"));
case HELP_CREATE:
@ -436,11 +438,12 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props,
}
/*
* zpool add [-fn] <pool> <vdev> ...
* zpool add [-fn] [-o property=value] <pool> <vdev> ...
*
* -f Force addition of devices, even if they appear in use
* -n Do not add the devices, but display the resulting layout if
* they were to be added.
* -o Set property=value.
*
* Adds the given vdevs to 'pool'. As with create, the bulk of this work is
* handled by get_vdev_spec(), which constructs the nvlist needed to pass to
@ -457,9 +460,11 @@ zpool_do_add(int argc, char **argv)
int ret;
zpool_handle_t *zhp;
nvlist_t *config;
nvlist_t *props = NULL;
char *propval;
/* check options */
while ((c = getopt(argc, argv, "fn")) != -1) {
while ((c = getopt(argc, argv, "fno:")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
@ -467,6 +472,19 @@ zpool_do_add(int argc, char **argv)
case 'n':
dryrun = B_TRUE;
break;
case 'o':
if ((propval = strchr(optarg, '=')) == NULL) {
(void) fprintf(stderr, gettext("missing "
"'=' for -o option\n"));
usage(B_FALSE);
}
*propval = '\0';
propval++;
if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) ||
(add_prop_list(optarg, propval, &props, B_TRUE)))
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@ -503,7 +521,7 @@ zpool_do_add(int argc, char **argv)
}
/* pass off to get_vdev_spec for processing */
nvroot = make_root_vdev(zhp, NULL, force, !force, B_FALSE, dryrun,
nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun,
argc, argv);
if (nvroot == NULL) {
zpool_close(zhp);
@ -536,6 +554,7 @@ zpool_do_add(int argc, char **argv)
ret = (zpool_add(zhp, nvroot) != 0);
}
nvlist_free(props);
nvlist_free(nvroot);
zpool_close(zhp);
@ -2865,6 +2884,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
nvlist_t *nvroot;
char *poolname, *old_disk, *new_disk;
zpool_handle_t *zhp;
nvlist_t *props = NULL;
char *propval;
int ret;
/* check options */
@ -2873,6 +2894,19 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
case 'f':
force = B_TRUE;
break;
case 'o':
if ((propval = strchr(optarg, '=')) == NULL) {
(void) fprintf(stderr, gettext("missing "
"'=' for -o option\n"));
usage(B_FALSE);
}
*propval = '\0';
propval++;
if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) ||
(add_prop_list(optarg, propval, &props, B_TRUE)))
usage(B_FALSE);
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@ -2929,7 +2963,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
return (1);
}
nvroot = make_root_vdev(zhp, NULL, force, B_FALSE, replacing, B_FALSE,
nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
argc, argv);
if (nvroot == NULL) {
zpool_close(zhp);
@ -2959,9 +2993,10 @@ zpool_do_replace(int argc, char **argv)
}
/*
* zpool attach [-f] <pool> <device> <new_device>
* zpool attach [-f] [-o property=value] <pool> <device> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
* -o Set property=value.
*
* Attach <new_device> to the mirror containing <device>. If <device> is not
* part of a mirror, then <device> will be transformed into a mirror of
@ -3736,7 +3771,7 @@ print_dedup_stats(nvlist_t *config)
/*
* If the pool was faulted then we may not have been able to
* obtain the config. Otherwise, if have anything in the dedup
* obtain the config. Otherwise, if we have anything in the dedup
* table continue processing the stats.
*/
if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,

View File

@ -0,0 +1,39 @@
dnl #
dnl # 2.6.30 API change
dnl #
dnl # The bdev_physical_block_size() interface was added to provide a way
dnl # to determine the smallest write which can be performed without a
dnl # read-modify-write operation. From the kernel documentation:
dnl #
dnl # What: /sys/block/<disk>/queue/physical_block_size
dnl # Date: May 2009
dnl # Contact: Martin K. Petersen <martin.petersen@oracle.com>
dnl # Description:
dnl # This is the smallest unit the storage device can write
dnl # without resorting to read-modify-write operation. It is
dnl # usually the same as the logical block size but may be
dnl # bigger. One example is SATA drives with 4KB sectors
dnl # that expose a 512-byte logical block size to the
dnl # operating system.
dnl #
dnl # Unfortunately, this interface isn't entirely reliable because
dnl # drives are sometimes known to misreport this value.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE], [
dnl # Configure-time probe: try to compile a kernel snippet which calls
dnl # bdev_physical_block_size() and record the outcome.
AC_MSG_CHECKING([whether bdev_physical_block_size() is available])
dnl # Save the caller's EXTRA_KCFLAGS so they can be restored afterwards.
tmp_flags="$EXTRA_KCFLAGS"
dnl # NOTE(review): presumably this keeps the probe from failing on a
dnl # warning about the test's local variable - confirm against the
dnl # flags used by ZFS_LINUX_TRY_COMPILE.
EXTRA_KCFLAGS="-Wno-unused-but-set-variable"
ZFS_LINUX_TRY_COMPILE([
#include <linux/blkdev.h>
],[
struct block_device *bdev = NULL;
bdev_physical_block_size(bdev);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BDEV_PHYSICAL_BLOCK_SIZE, 1,
[bdev_physical_block_size() is available])
],[
AC_MSG_RESULT(no)
])
dnl # Restore the flags saved before the probe.
EXTRA_KCFLAGS="$tmp_flags"
])

View File

@ -14,6 +14,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE
ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS
ZFS_AC_KERNEL_BDEV_LOGICAL_BLOCK_SIZE
ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE
ZFS_AC_KERNEL_BIO_EMPTY_BARRIER
ZFS_AC_KERNEL_BIO_FAILFAST
ZFS_AC_KERNEL_BIO_FAILFAST_DTD

View File

@ -394,13 +394,27 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags)
/*
* 2.6.30 API change
* Change to make it explicit that this is the logical block size.
* To ensure good performance preferentially use the physical block size
* for proper alignment. The physical size is supposed to be the internal
* sector size used by the device. This is often 4096 bytes for AF devices,
* while a smaller 512 byte logical size is supported for compatibility.
*
* Unfortunately, many drives still misreport their physical sector size.
* For devices which are known to lie you may need to manually set this
* at pool creation time with 'zpool create -o ashift=12 ...'.
*
* When the physical block size interface isn't available, we fall back to
* the logical block size interface and then the older hard sector size.
*/
#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE
# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev)
#ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE
# define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev)
#else
# ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE
# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev)
# else
# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev)
#endif
# endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */
#endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */
/*
* 2.6.37 API change

View File

@ -47,6 +47,7 @@ extern "C" {
#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum"
#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small"
#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
#define FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT "vdev.bad_ashift"
#define FM_EREPORT_ZFS_DEVICE_REMOVE "vdev.remove"
#define FM_EREPORT_ZFS_DEVICE_CLEAR "vdev.clear"
#define FM_EREPORT_ZFS_DEVICE_CHECK "vdev.check"
@ -71,6 +72,7 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"

View File

@ -2,6 +2,7 @@
.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright 2011 Nexenta Systems, Inc. All rights reserved.
.\" Copyright (c) 2012 by Delphix. All Rights Reserved.
.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the
.\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
@ -16,12 +17,12 @@ zpool \- configures ZFS storage pools
.LP
.nf
\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...
\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ...
.fi
.LP
.nf
\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR
\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR
.fi
.LP
@ -711,7 +712,7 @@ Displays a help message.
.ne 2
.mk
.na
\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR
\fB\fBzpool add\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIvdev\fR ...\fR
.ad
.sp .6
.RS 4n
@ -738,6 +739,17 @@ Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting r
Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing.
.RE
.sp
.ne 2
.mk
.na
\fB\fB-o\fR \fIproperty=value\fR
.ad
.sp .6
.RS 4n
Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift".
.RE
Do not add a disk that is currently configured as a quorum device to a zpool. After a disk is in the pool, that disk can then be configured as a quorum device.
.RE
@ -745,7 +757,7 @@ Do not add a disk that is currently configured as a quorum device to a zpool. Af
.ne 2
.mk
.na
\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR
\fB\fBzpool attach\fR [\fB-f\fR] [\fB-o\fR \fIproperty=value\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR
.ad
.sp .6
.RS 4n
@ -761,6 +773,17 @@ Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device
Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
.RE
.sp
.ne 2
.mk
.na
\fB\fB-o\fR \fIproperty=value\fR
.ad
.sp .6
.RS 4n
Sets the given pool properties. See the "Properties" section for a list of valid properties that can be set. The only property supported at the moment is "ashift".
.RE
.RE
.sp

View File

@ -1271,13 +1271,16 @@ vdev_open(vdev_t *vd)
vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
} else {
/*
* Make sure the alignment requirement hasn't increased.
* Detect if the alignment requirement has increased.
* We don't want to make the pool unavailable, just
* post an event instead.
*/
if (ashift > vd->vdev_top->vdev_ashift) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (EINVAL);
if (ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
spa, vd, NULL, 0, 0);
}
vd->vdev_max_asize = max_asize;
}

View File

@ -267,6 +267,10 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
DATA_TYPE_STRING, vd->vdev_fru, NULL);
if (vd->vdev_ashift)
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
if (pvd != NULL) {
fm_payload_set(ereport,