mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 18:34:22 +03:00
Add "ashift" property to zpool create
Some disks with internal sectors larger than 512 bytes (e.g., 4k) can suffer from bad write performance when ashift is not configured correctly. This is caused by the disk not reporting its actual sector size, but a sector size of 512 bytes. The drive may behave this way for compatibility reasons. For example, the WDC WD20EARS disks are known to exhibit this behavior. When creating a zpool, ZFS takes that wrong sector size and sets the "ashift" property accordingly (to 9: 1<<9=512), whereas it should be set to 12 for 4k sectors (1<<12=4096). This patch allows an adminstrator to manual specify the known correct ashift size at 'zpool create' time. This can significantly improve performance in certain cases. However, it will have an impact on your total pool capacity. See the updated ashift property description in the zpool.8 man page for additional details. Valid values for the ashift property range from 9 to 17 (512B-128KB). Additionally, you may set the ashift to 0 if you wish to auto-detect the sector size based on what the disk reports, this is the default behavior. The most common ashift values are 9 and 12. Example: zpool create -o ashift=12 tank raidz2 sda sdb sdc sdd Closes #280 Original-patch-by: Richard Laager <rlaager@wiktel.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
This commit is contained in:
parent
96801d2906
commit
df30f56639
3
AUTHORS
3
AUTHORS
@ -34,3 +34,6 @@ to the project and deserve to be acknowledged.
|
|||||||
Jim Garlick <garlick@llnl.gov>
|
Jim Garlick <garlick@llnl.gov>
|
||||||
Manuel Amador (Rudd-O) <rudd-o@rudd-o.com>
|
Manuel Amador (Rudd-O) <rudd-o@rudd-o.com>
|
||||||
Gunnar Beutner <gunnar@beutner.name>
|
Gunnar Beutner <gunnar@beutner.name>
|
||||||
|
Darik Horn <dajhorn@vanadac.com>
|
||||||
|
Richard Laager <rlaager@wiktel.com>
|
||||||
|
Christian Kohlschütter <christian@kohlschutter.com>
|
||||||
|
@ -488,7 +488,7 @@ zpool_do_add(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* pass off to get_vdev_spec for processing */
|
/* pass off to get_vdev_spec for processing */
|
||||||
nvroot = make_root_vdev(zhp, force, !force, B_FALSE, dryrun,
|
nvroot = make_root_vdev(zhp, NULL, force, !force, B_FALSE, dryrun,
|
||||||
argc, argv);
|
argc, argv);
|
||||||
if (nvroot == NULL) {
|
if (nvroot == NULL) {
|
||||||
zpool_close(zhp);
|
zpool_close(zhp);
|
||||||
@ -688,7 +688,7 @@ zpool_do_create(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* pass off to get_vdev_spec for bulk processing */
|
/* pass off to get_vdev_spec for bulk processing */
|
||||||
nvroot = make_root_vdev(NULL, force, !force, B_FALSE, dryrun,
|
nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun,
|
||||||
argc - 1, argv + 1);
|
argc - 1, argv + 1);
|
||||||
if (nvroot == NULL)
|
if (nvroot == NULL)
|
||||||
goto errout;
|
goto errout;
|
||||||
@ -2683,7 +2683,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
|
|||||||
return (1);
|
return (1);
|
||||||
}
|
}
|
||||||
|
|
||||||
nvroot = make_root_vdev(zhp, force, B_FALSE, replacing, B_FALSE,
|
nvroot = make_root_vdev(zhp, NULL, force, B_FALSE, replacing, B_FALSE,
|
||||||
argc, argv);
|
argc, argv);
|
||||||
if (nvroot == NULL) {
|
if (nvroot == NULL) {
|
||||||
zpool_close(zhp);
|
zpool_close(zhp);
|
||||||
|
@ -43,8 +43,8 @@ uint_t num_logs(nvlist_t *nv);
|
|||||||
* Virtual device functions
|
* Virtual device functions
|
||||||
*/
|
*/
|
||||||
|
|
||||||
nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
|
nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force,
|
||||||
boolean_t replacing, boolean_t dryrun, int argc, char **argv);
|
int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv);
|
||||||
nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
|
nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
|
||||||
nvlist_t *props, splitflags_t flags, int argc, char **argv);
|
nvlist_t *props, splitflags_t flags, int argc, char **argv);
|
||||||
|
|
||||||
|
@ -407,7 +407,7 @@ is_shorthand_path(const char *arg, char *path,
|
|||||||
* xxx Shorthand for /dev/disk/yyy/xxx
|
* xxx Shorthand for /dev/disk/yyy/xxx
|
||||||
*/
|
*/
|
||||||
static nvlist_t *
|
static nvlist_t *
|
||||||
make_leaf_vdev(const char *arg, uint64_t is_log)
|
make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
|
||||||
{
|
{
|
||||||
char path[MAXPATHLEN];
|
char path[MAXPATHLEN];
|
||||||
struct stat64 statbuf;
|
struct stat64 statbuf;
|
||||||
@ -499,6 +499,19 @@ make_leaf_vdev(const char *arg, uint64_t is_log)
|
|||||||
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
|
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
|
||||||
(uint64_t)wholedisk) == 0);
|
(uint64_t)wholedisk) == 0);
|
||||||
|
|
||||||
|
if (props != NULL) {
|
||||||
|
uint64_t ashift = 0;
|
||||||
|
char *value = NULL;
|
||||||
|
|
||||||
|
if (nvlist_lookup_string(props,
|
||||||
|
zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0)
|
||||||
|
zfs_nicestrtonum(NULL, value, &ashift);
|
||||||
|
|
||||||
|
if (ashift > 0)
|
||||||
|
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT,
|
||||||
|
ashift) == 0);
|
||||||
|
}
|
||||||
|
|
||||||
return (vdev);
|
return (vdev);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1195,7 +1208,7 @@ is_grouping(const char *type, int *mindev, int *maxdev)
|
|||||||
* because the program is just going to exit anyway.
|
* because the program is just going to exit anyway.
|
||||||
*/
|
*/
|
||||||
nvlist_t *
|
nvlist_t *
|
||||||
construct_spec(int argc, char **argv)
|
construct_spec(nvlist_t *props, int argc, char **argv)
|
||||||
{
|
{
|
||||||
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
|
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
|
||||||
int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
|
int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
|
||||||
@ -1284,7 +1297,7 @@ construct_spec(int argc, char **argv)
|
|||||||
children * sizeof (nvlist_t *));
|
children * sizeof (nvlist_t *));
|
||||||
if (child == NULL)
|
if (child == NULL)
|
||||||
zpool_no_memory();
|
zpool_no_memory();
|
||||||
if ((nv = make_leaf_vdev(argv[c], B_FALSE))
|
if ((nv = make_leaf_vdev(props, argv[c], B_FALSE))
|
||||||
== NULL)
|
== NULL)
|
||||||
return (NULL);
|
return (NULL);
|
||||||
child[children - 1] = nv;
|
child[children - 1] = nv;
|
||||||
@ -1340,7 +1353,7 @@ construct_spec(int argc, char **argv)
|
|||||||
* We have a device. Pass off to make_leaf_vdev() to
|
* We have a device. Pass off to make_leaf_vdev() to
|
||||||
* construct the appropriate nvlist describing the vdev.
|
* construct the appropriate nvlist describing the vdev.
|
||||||
*/
|
*/
|
||||||
if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL)
|
if ((nv = make_leaf_vdev(props, argv[0], is_log)) == NULL)
|
||||||
return (NULL);
|
return (NULL);
|
||||||
if (is_log)
|
if (is_log)
|
||||||
nlogs++;
|
nlogs++;
|
||||||
@ -1406,7 +1419,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
|
|||||||
uint_t c, children;
|
uint_t c, children;
|
||||||
|
|
||||||
if (argc > 0) {
|
if (argc > 0) {
|
||||||
if ((newroot = construct_spec(argc, argv)) == NULL) {
|
if ((newroot = construct_spec(props, argc, argv)) == NULL) {
|
||||||
(void) fprintf(stderr, gettext("Unable to build a "
|
(void) fprintf(stderr, gettext("Unable to build a "
|
||||||
"pool from the specified devices\n"));
|
"pool from the specified devices\n"));
|
||||||
return (NULL);
|
return (NULL);
|
||||||
@ -1456,7 +1469,7 @@ split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
|
|||||||
* added, even if they appear in use.
|
* added, even if they appear in use.
|
||||||
*/
|
*/
|
||||||
nvlist_t *
|
nvlist_t *
|
||||||
make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
|
make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
|
||||||
boolean_t replacing, boolean_t dryrun, int argc, char **argv)
|
boolean_t replacing, boolean_t dryrun, int argc, char **argv)
|
||||||
{
|
{
|
||||||
nvlist_t *newroot;
|
nvlist_t *newroot;
|
||||||
@ -1468,7 +1481,7 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
|
|||||||
* that we have a valid specification, and that all devices can be
|
* that we have a valid specification, and that all devices can be
|
||||||
* opened.
|
* opened.
|
||||||
*/
|
*/
|
||||||
if ((newroot = construct_spec(argc, argv)) == NULL)
|
if ((newroot = construct_spec(props, argc, argv)) == NULL)
|
||||||
return (NULL);
|
return (NULL);
|
||||||
|
|
||||||
if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
|
if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
|
||||||
|
@ -161,6 +161,7 @@ typedef enum {
|
|||||||
ZPOOL_PROP_FREE,
|
ZPOOL_PROP_FREE,
|
||||||
ZPOOL_PROP_ALLOCATED,
|
ZPOOL_PROP_ALLOCATED,
|
||||||
ZPOOL_PROP_READONLY,
|
ZPOOL_PROP_READONLY,
|
||||||
|
ZPOOL_PROP_ASHIFT,
|
||||||
ZPOOL_NUM_PROPS
|
ZPOOL_NUM_PROPS
|
||||||
} zpool_prop_t;
|
} zpool_prop_t;
|
||||||
|
|
||||||
|
@ -270,6 +270,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
|
|||||||
case ZPOOL_PROP_SIZE:
|
case ZPOOL_PROP_SIZE:
|
||||||
case ZPOOL_PROP_ALLOCATED:
|
case ZPOOL_PROP_ALLOCATED:
|
||||||
case ZPOOL_PROP_FREE:
|
case ZPOOL_PROP_FREE:
|
||||||
|
case ZPOOL_PROP_ASHIFT:
|
||||||
(void) zfs_nicenum(intval, buf, len);
|
(void) zfs_nicenum(intval, buf, len);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -433,6 +434,24 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ZPOOL_PROP_ASHIFT:
|
||||||
|
if (!flags.create) {
|
||||||
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
||||||
|
"property '%s' can only be set at "
|
||||||
|
"creation time"), propname);
|
||||||
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (intval != 0 && (intval < 9 || intval > 17)) {
|
||||||
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
||||||
|
"property '%s' number %d is invalid."),
|
||||||
|
propname, intval);
|
||||||
|
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
case ZPOOL_PROP_BOOTFS:
|
case ZPOOL_PROP_BOOTFS:
|
||||||
if (flags.create || flags.import) {
|
if (flags.create || flags.import) {
|
||||||
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
|
||||||
|
@ -502,6 +502,23 @@ Amount of storage space used within the pool.
|
|||||||
.sp
|
.sp
|
||||||
.LP
|
.LP
|
||||||
These space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(8) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable.
|
These space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(8) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable.
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.LP
|
||||||
|
The following property can be set at creation time:
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.mk
|
||||||
|
.na
|
||||||
|
\fB\fBashift\fR\fR
|
||||||
|
.ad
|
||||||
|
.sp .6
|
||||||
|
.RS 4n
|
||||||
|
Pool sector size exponent, to the power of 2 (internally referred to as "ashift"). I/O operations will be aligned to the specified size boundaries. Additionally, the minimum (disk) write size will be set to the specified size, so this represents a space vs. performance trade-off. The typical case for setting this property is when performance is important and the underlying disks use 4KiB sectors but report 512B sectors to the OS (for compatibility reasons); in that case, set \fBashift=12\fR (which is 1<<12 = 4096).
|
||||||
|
.LP
|
||||||
|
For optimal performance, the pool sector size should be greater than or equal to the sector size of the underlying disks. Since the property cannot be changed after pool creation, if in a given pool, you \fIever\fR want to use drives that \fIreport\fR 4KiB sectors, you must set \fBashift=12\fR at pool creation time.
|
||||||
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
.LP
|
.LP
|
||||||
The following property can be set at creation time and import time:
|
The following property can be set at creation time and import time:
|
||||||
|
@ -87,6 +87,10 @@ zpool_prop_init(void)
|
|||||||
PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
|
PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
|
||||||
"DEDUP");
|
"DEDUP");
|
||||||
|
|
||||||
|
/* readonly onetime number properties */
|
||||||
|
zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_ONETIME,
|
||||||
|
ZFS_TYPE_POOL, "<ashift, 9-17, or 0=default>", "ASHIFT");
|
||||||
|
|
||||||
/* default number properties */
|
/* default number properties */
|
||||||
zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
|
zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
|
||||||
PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
|
PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
|
||||||
|
Loading…
Reference in New Issue
Block a user