diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index de689afa7..f3756a5d9 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2082,7 +2082,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; - uint_t c, i, children; + uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; @@ -2099,7 +2099,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); + (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); @@ -2199,6 +2199,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) printf(gettext("unsupported feature(s)")); break; + case VDEV_AUX_ASHIFT_TOO_BIG: + (void) printf(gettext("unsupported minimum blocksize")); + break; + case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); @@ -8105,6 +8109,15 @@ status_callback(zpool_handle_t *zhp, void *data) "'zpool clear'.\n")); break; + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + (void) printf(gettext("status: One or more devices are " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + (void) printf(gettext("action: Replace affected devices with " + "devices that support the\n\tconfigured block size, or " + "migrate data to a properly configured\n\tpool.\n")); + break; + case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" diff --git a/include/libzfs.h b/include/libzfs.h index b405ad1e1..4e6336180 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -390,6 +390,7 @@ typedef enum { ZPOOL_STATUS_REMOVED_DEV, /* removed device */ ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ + ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ /* * Finally, the following indicates a healthy pool. diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 9a3b29e1e..ec1da1a46 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -78,6 +78,12 @@ #define param_set_slop_shift_args(var) \ CTLTYPE_INT, &var, 0, param_set_slop_shift, "I" +#define param_set_min_auto_ashift_args(var) \ + CTLTYPE_U64, &var, 0, param_set_min_auto_ashift, "QU" + +#define param_set_max_auto_ashift_args(var) \ + CTLTYPE_U64, &var, 0, param_set_max_auto_ashift, "QU" + #include #define module_init(fn) \ static void \ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 001893b71..d3acd674a 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -873,6 +873,7 @@ typedef enum vdev_aux { VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ + VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ } vdev_aux_t; /* @@ -1068,8 +1069,17 @@ typedef struct vdev_stat { uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ uint64_t vs_rebuild_processed; /* bytes rebuilt */ + uint64_t vs_configured_ashift; /* TLV vdev_ashift */ + uint64_t vs_logical_ashift; /* vdev_logical_ashift */ + uint64_t vs_physical_ashift; /* vdev_physical_ashift */ } vdev_stat_t; +/* BEGIN CSTYLED */ +#define VDEV_STAT_VALID(field, uint64_t_field_count) \ + ((uint64_t_field_count * sizeof (uint64_t)) >= \ + (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) +/* END CSTYLED */ + /* * Extended stats * diff --git a/include/sys/vdev.h b/include/sys/vdev.h index a7e880636..797065fdd 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -94,6 +94,7 @@ extern void vdev_rele(vdev_t *); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_ashift_optimize(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, char *tag); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index b9298c62d..90d607746 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -69,7 +69,7 @@ extern uint32_t zfs_vdev_async_write_max_active; * Virtual device operations */ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, - uint64_t *ashift); + uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef void vdev_io_start_func_t(zio_t *zio); @@ -216,6 +216,25 @@ struct vdev { uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ + + /* + * Logical block alignment shift + * + * The smallest sized/aligned I/O supported by the device. + */ + uint64_t vdev_logical_ashift; + /* + * Physical block alignment shift + * + * The device supports logical I/Os with vdev_logical_ashift + * size/alignment, but optimum performance will be achieved by + * aligning/sizing requests to vdev_physical_ashift. Smaller + * requests may be inflated or incur device level read-modify-write + * operations. + * + * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). + */ + uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ @@ -586,6 +605,14 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); +/* + * Vdev ashift optimization tunables + */ +extern uint64_t zfs_vdev_min_auto_ashift; +extern uint64_t zfs_vdev_max_auto_ashift; +int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); +int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); + #ifdef __cplusplus } #endif diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index 67b8ea33e..435937041 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -94,57 +95,69 @@ static char *zfs_msgid_table[] = { /* ARGSUSED */ static int -vdev_missing(uint64_t state, uint64_t aux, uint64_t errs) +vdev_missing(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN && - aux == VDEV_AUX_OPEN_FAILED); + return (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_OPEN_FAILED); } /* ARGSUSED */ static int -vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs) +vdev_faulted(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_FAULTED); + return (vs->vs_state == VDEV_STATE_FAULTED); } /* ARGSUSED */ static int -vdev_errors(uint64_t state, uint64_t aux, uint64_t errs) +vdev_errors(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_DEGRADED || errs != 0); + return (vs->vs_state == VDEV_STATE_DEGRADED || + vs->vs_read_errors != 0 || vs->vs_write_errors != 0 || + vs->vs_checksum_errors != 0); } /* ARGSUSED */ static int -vdev_broken(uint64_t state, uint64_t aux, uint64_t errs) +vdev_broken(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN); + return (vs->vs_state == VDEV_STATE_CANT_OPEN); } /* ARGSUSED */ static int -vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) +vdev_offlined(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_OFFLINE); + return (vs->vs_state == VDEV_STATE_OFFLINE); } /* ARGSUSED */ static int -vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +vdev_removed(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_REMOVED); + return (vs->vs_state == VDEV_STATE_REMOVED); +} + +static int +vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) +{ + if (getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") != NULL) + return (0); + + return (VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift); } /* * Detect if any leaf devices that have seen errors or could not be opened. */ static boolean_t -find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) +find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), + boolean_t ignore_replacing) { nvlist_t **child; vdev_stat_t *vs; - uint_t c, children; - char *type; + uint_t c, vsc, children; /* * Ignore problems within a 'replacing' vdev, since we're presumably in @@ -152,23 +165,25 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) * out again. We'll pick up the fact that a resilver is happening * later. */ - verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - return (B_FALSE); + if (ignore_replacing == B_TRUE) { + char *type; + + verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, + &type) == 0); + if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + return (B_FALSE); + } if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func)) + if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } else { verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); + (uint64_t **)&vs, &vsc) == 0); - if (func(vs->vs_state, vs->vs_aux, - vs->vs_read_errors + - vs->vs_write_errors + - vs->vs_checksum_errors)) + if (func(vs, vsc) != 0) return (B_TRUE); } @@ -178,7 +193,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func)) + if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } @@ -362,15 +377,15 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) * Bad devices in non-replicated config. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_faulted)) + find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_missing)) + find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_broken)) + find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_NR); /* @@ -392,31 +407,37 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) /* * Missing devices in a replicated config. */ - if (find_vdev_problem(nvroot, vdev_faulted)) + if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); - if (find_vdev_problem(nvroot, vdev_missing)) + if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_R); - if (find_vdev_problem(nvroot, vdev_broken)) + if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_R); /* * Devices with errors */ - if (!isimport && find_vdev_problem(nvroot, vdev_errors)) + if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) return (ZPOOL_STATUS_FAILING_DEV); /* * Offlined devices */ - if (find_vdev_problem(nvroot, vdev_offlined)) + if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) return (ZPOOL_STATUS_OFFLINE_DEV); /* * Removed device */ - if (find_vdev_problem(nvroot, vdev_removed)) + if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) return (ZPOOL_STATUS_REMOVED_DEV); + /* + * Suboptimal, but usable, ashift configuration. + */ + if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) + return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); + /* * Informational errata available. */ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 463e51acc..853e8fc94 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -486,6 +486,29 @@ Default limit for metaslab size. Default value: \fB29\fR [meaning (1 << 29) = 512MB]. .RE +.sp +.ne 2 +.na +\fBzfs_vdev_max_auto_ashift\fR (ulong) +.ad +.RS 12n +Maximum ashift used when optimizing for logical -> physical sector size on new +top-level vdevs. +.sp +Default value: \fBASHIFT_MAX\fR (16). +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_min_auto_ashift\fR (ulong) +.ad +.RS 12n +Minimum ashift used when creating new top-level vdevs. +.sp +Default value: \fBASHIFT_MIN\fR (9). +.RE + .sp .ne 2 .na diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index c23fa0591..0fe6866f3 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -470,6 +470,12 @@ The maximum time in milliseconds that .Nm zpool import will wait for an expected device to be available. .El +.Bl -tag -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE" +.It Ev ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE +If set, suppress warning about non-native vdev ashift in +.Nm zpool status . +The value is not used, only the presence or absence of the variable matters. +.El .Bl -tag -width "ZPOOL_VDEV_NAME_GUID" .It Ev ZPOOL_VDEV_NAME_GUID Cause diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index d76b49de9..200bbf43d 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -518,56 +519,53 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, /* vdev.c */ -#ifdef notyet -extern uint64_t zfs_max_auto_ashift; -extern uint64_t zfs_min_auto_ashift; - -static int -sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) +int +param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; - val = zfs_max_auto_ashift; + val = zfs_vdev_min_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) - return (err); + return (SET_ERROR(err)); - if (val > ASHIFT_MAX || val < zfs_min_auto_ashift) - return (EINVAL); + if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) + return (SET_ERROR(EINVAL)); - zfs_max_auto_ashift = val; + zfs_vdev_min_auto_ashift = val; return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), - sysctl_vfs_zfs_max_auto_ashift, "QU", - "Max ashift used when optimising for logical -> physical sectors size on " - "new top-level vdevs."); -static int -sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) + +int +param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; - val = zfs_min_auto_ashift; + val = zfs_vdev_max_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) - return (err); + return (SET_ERROR(err)); - if (val < ASHIFT_MIN || val > zfs_max_auto_ashift) - return (EINVAL); + if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) + return (SET_ERROR(EINVAL)); - zfs_min_auto_ashift = val; + zfs_vdev_max_auto_ashift = val; return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), - sysctl_vfs_zfs_min_auto_ashift, "QU", - "Min ashift used when creating new top-level vdevs."); -#endif + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN, + &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), + param_set_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdev. (LEGACY)"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN, + &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), + param_set_max_auto_ashift, "QU", + "Max ashift used when optimizing for logical -> physical sector size on " + "new top-level vdevs. (LEGACY)"); /* * Since the DTL space map of a vdev is not expected to have a lot of diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index cca6bffd9..4d27751c8 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -83,7 +83,7 @@ vdev_file_open_mode(spa_mode_t spa_mode) static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; zfs_file_t *fp; @@ -167,7 +167,8 @@ skip_open: } *max_psize = *psize = zfa.zfa_size; - *ashift = SPA_MINBLOCKSHIFT; + *logical_ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; return (0); } diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 8462755f8..bf06f6919 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -802,7 +802,7 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid) static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { struct g_provider *pp; struct g_consumer *cp; @@ -949,11 +949,12 @@ skip_open: * transfer size. */ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; -#ifdef notyet - if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && - pp->stripesize <= (1 << ASHIFT_MAX) && pp->stripeoffset == 0) + *physical_ashift = 0; + if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && + ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && + pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; -#endif + /* * Clear the nowritecache settings, so that on a vdev_reopen() * we will try again. diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 5869b474d..5a2245436 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -159,7 +159,7 @@ vdev_disk_error(zio_t *zio) static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); @@ -270,7 +270,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, struct request_queue *q = bdev_get_queue(vd->vd_bdev); /* Determine the physical block size */ - int block_size = bdev_physical_block_size(vd->vd_bdev); + int physical_block_size = bdev_physical_block_size(vd->vd_bdev); + + /* Determine the logical block size */ + int logical_block_size = bdev_logical_block_size(vd->vd_bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; @@ -291,7 +294,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ - *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = highbit64(MAX(physical_block_size, + SPA_MINBLOCKSIZE)) - 1; + + *logical_ashift = highbit64(MAX(logical_block_size, + SPA_MINBLOCKSIZE)) - 1; return (0); } @@ -824,3 +831,43 @@ char *zfs_vdev_scheduler = "unused"; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, param_get_charp, &zfs_vdev_scheduler, 0644); MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); + +int +param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) +{ + uint64_t val; + int error; + + error = kstrtoull(buf, 0, &val); + if (error < 0) + return (SET_ERROR(error)); + + if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) + return (SET_ERROR(-EINVAL)); + + error = param_set_ulong(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} + +int +param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) +{ + uint64_t val; + int error; + + error = kstrtoull(buf, 0, &val); + if (error < 0) + return (SET_ERROR(error)); + + if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) + return (SET_ERROR(-EINVAL)); + + error = param_set_ulong(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 592ba2b4a..a4e71ca40 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -75,7 +75,7 @@ vdev_file_open_mode(spa_mode_t spa_mode) static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; zfs_file_t *fp; @@ -159,7 +159,8 @@ skip_open: } *max_psize = *psize = zfa.zfa_size; - *ashift = SPA_MINBLOCKSHIFT; + *logical_ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; return (0); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index fc62af7c7..bd1a993dc 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -9283,6 +9283,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) ASSERT(!l2arc_vdev_present(vd)); + vdev_ashift_optimize(vd); + /* * Create a new l2arc device entry. */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index e358404db..1e3728d93 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -5747,6 +5747,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; + vdev_ashift_optimize(vd); vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 95dd19844..cc65a00d9 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -576,8 +576,10 @@ spa_config_update(spa_t *spa, int what) (tvd->vdev_islog && tvd->vdev_removing)) continue; - if (tvd->vdev_ms_array == 0) + if (tvd->vdev_ms_array == 0) { + vdev_ashift_optimize(tvd); vdev_metaslab_set_size(tvd); + } vdev_expand(tvd, txg); } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index a51e427f8..1844a5653 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -110,6 +110,9 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; +uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; +uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; + /*PRINTFLIKE2*/ void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) @@ -1176,6 +1179,8 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; + mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; @@ -1207,7 +1212,8 @@ vdev_remove_parent(vdev_t *cvd) mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; - + cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; + cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); @@ -1677,7 +1683,8 @@ vdev_open(vdev_t *vd) uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; - uint64_t ashift = 0; + uint64_t logical_ashift = 0; + uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -1707,8 +1714,8 @@ vdev_open(vdev_t *vd) return (SET_ERROR(ENXIO)); } - error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); - + error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, + &logical_ashift, &physical_ashift); /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy @@ -1823,6 +1830,17 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EINVAL)); } + vd->vdev_physical_ashift = + MAX(physical_ashift, vd->vdev_physical_ashift); + vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); + vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); + + if (vd->vdev_logical_ashift > ASHIFT_MAX) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_ASHIFT_TOO_BIG); + return (SET_ERROR(EDOM)); + } + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1830,9 +1848,6 @@ vdev_open(vdev_t *vd) */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; - if (vd->vdev_ashift == 0) { - vd->vdev_ashift = ashift; /* use detected value */ - } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1841,16 +1856,17 @@ vdev_open(vdev_t *vd) } } else { /* - * Detect if the alignment requirement has increased. - * We don't want to make the pool unavailable, just - * post an event instead. + * Make sure the alignment required hasn't increased. */ - if (ashift > vd->vdev_top->vdev_ashift && + if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0, 0); - } + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (SET_ERROR(EDOM)); + } vd->vdev_max_asize = max_asize; } @@ -2428,6 +2444,35 @@ vdev_metaslab_set_size(vdev_t *vd) ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } +/* + * Maximize performance by inflating the configured ashift for top level + * vdevs to be as close to the physical ashift as possible while maintaining + * administrator defined limits and ensuring it doesn't go below the + * logical ashift. + */ +void +vdev_ashift_optimize(vdev_t *vd) +{ + if (vd == vd->vdev_top) { + if (vd->vdev_ashift < vd->vdev_physical_ashift) { + vd->vdev_ashift = MIN( + MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), + MAX(zfs_vdev_min_auto_ashift, + vd->vdev_physical_ashift)); + } else { + /* + * Unusual case where logical ashift > physical ashift + * so we can't cap the calculated ashift based on max + * ashift as that would cause failures. + * We still check if we need to increase it to match + * the min ashift. + */ + vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, + vd->vdev_ashift); + } + } +} + void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { @@ -4083,6 +4128,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) 1ULL << tvd->vdev_ms_shift); } + vs->vs_configured_ashift = vd->vdev_top != NULL + ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; + vs->vs_logical_ashift = vd->vdev_logical_ashift; + vs->vs_physical_ashift = vd->vdev_physical_ashift; + /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. @@ -5028,4 +5078,13 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, + param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, + "Minimum ashift used when creating new top-level vdevs"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, + param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, + "Maximum ashift used when optimizing for logical -> physical sector " + "size on new top-level vdevs"); /* END CSTYLED */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 4cd83d79e..6a944f4e8 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -950,11 +950,12 @@ vdev_indirect_close(vdev_t *vd) /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - *ashift = vd->vdev_ashift; + *logical_ashift = vd->vdev_ashift; + *physical_ashift = vd->vdev_physical_ashift; return (0); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 094530e9b..5e1060f12 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -366,7 +366,7 @@ vdev_mirror_map_init(zio_t *zio) static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { int numerrors = 0; int lasterror = 0; @@ -389,7 +389,9 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + vd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c index 205b23eba..ce90df6e8 100644 --- a/module/zfs/vdev_missing.c +++ b/module/zfs/vdev_missing.c @@ -45,7 +45,7 @@ /* ARGSUSED */ static int vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *ashift, uint64_t *pshift) { /* * Really this should just fail. But then the root vdev will be in the @@ -56,6 +56,7 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *psize = 0; *max_psize = 0; *ashift = 0; + *pshift = 0; return (0); } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index be3466673..8d4962805 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1554,7 +1554,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; @@ -1583,7 +1583,9 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index ce79f7c73..9e8aac7d0 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -82,7 +82,7 @@ too_many_errors(vdev_t *vd, uint64_t numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *ashift, uint64_t *pshift) { spa_t *spa = vd->vdev_spa; int lasterror = 0; @@ -116,6 +116,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = 0; *max_asize = 0; *ashift = 0; + *pshift = 0; return (0); }