mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-03-10 12:26:27 +03:00
Add vdev property to disable vdev scheduler
Added vdev property to disable the vdev scheduler. The intention behind this property is to improve IOPS performance when using O_DIRECT. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com> Signed-off-by: MigeljanImeri <ImeriMigel@gmail.com> Closes #17358
This commit is contained in:
parent
d2f5cb3a50
commit
4975430cf5
@ -389,9 +389,22 @@ typedef enum {
|
||||
VDEV_PROP_SIT_OUT,
|
||||
VDEV_PROP_AUTOSIT,
|
||||
VDEV_PROP_SLOW_IO_EVENTS,
|
||||
VDEV_PROP_SCHEDULER,
|
||||
VDEV_NUM_PROPS
|
||||
} vdev_prop_t;
|
||||
|
||||
/*
 * Different scheduling behaviors for vdev scheduler property.
 * VDEV_SCHEDULER_AUTO = Let ZFS decide - currently the vdev queue is
 *			 bypassed only for non-rotational block devices;
 *			 rotational devices and file vdevs are still queued.
 * VDEV_SCHEDULER_ON   = Always queue.
 * VDEV_SCHEDULER_OFF  = Never queue.
 *
 * The numeric values are pinned explicitly: they are handed around as raw
 * uint64 property values (stored into and loaded back from
 * vdev_t.vdev_scheduler), so reordering or inserting enumerators must not
 * silently change the meaning of an already-set property.
 */
typedef enum {
	VDEV_SCHEDULER_AUTO = 0,
	VDEV_SCHEDULER_ON = 1,
	VDEV_SCHEDULER_OFF = 2
} vdev_scheduler_type_t;
|
||||
|
||||
/*
|
||||
* Dataset property functions shared between libzfs and kernel.
|
||||
*/
|
||||
|
||||
@ -425,6 +425,7 @@ struct vdev {
|
||||
boolean_t vdev_resilver_deferred; /* resilver deferred */
|
||||
boolean_t vdev_kobj_flag; /* kobj event record */
|
||||
boolean_t vdev_attaching; /* vdev attach ashift handling */
|
||||
boolean_t vdev_is_blkdev; /* vdev is backed by block device */
|
||||
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
|
||||
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
|
||||
zio_t *vdev_probe_zio; /* root of current probe */
|
||||
@ -473,6 +474,7 @@ struct vdev {
|
||||
boolean_t vdev_slow_io_events;
|
||||
uint64_t vdev_slow_io_n;
|
||||
uint64_t vdev_slow_io_t;
|
||||
uint64_t vdev_scheduler; /* control how I/Os are submitted */
|
||||
};
|
||||
|
||||
#define VDEV_PAD_SIZE (8 << 10)
|
||||
|
||||
@ -6255,7 +6255,8 @@
|
||||
<enumerator name='VDEV_PROP_SIT_OUT' value='52'/>
|
||||
<enumerator name='VDEV_PROP_AUTOSIT' value='53'/>
|
||||
<enumerator name='VDEV_PROP_SLOW_IO_EVENTS' value='54'/>
|
||||
<enumerator name='VDEV_NUM_PROPS' value='55'/>
|
||||
<enumerator name='VDEV_PROP_SCHEDULER' value='55'/>
|
||||
<enumerator name='VDEV_NUM_PROPS' value='56'/>
|
||||
</enum-decl>
|
||||
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
|
||||
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
|
||||
|
||||
@ -194,6 +194,23 @@ If this device should perform new allocations, used to disable a device
|
||||
when it is scheduled for later removal.
|
||||
See
|
||||
.Xr zpool-remove 8 .
|
||||
.It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off
|
||||
Controls how I/O requests are added to the vdev queue when reading or
|
||||
writing to this vdev.
|
||||
This property can be set on leaf vdevs.
|
||||
The value of this property does not persist across vdev replacement.
|
||||
.Bl -tag -compact -width "auto"
|
||||
.It Ar auto
|
||||
Let ZFS choose which scheduler it thinks will be best.
|
||||
Currently, the scheduler will queue I/O if the vdev is backed by a rotational
|
||||
block device or file, and not queue otherwise.
|
||||
.It Ar on
|
||||
Always adds I/O requests to the vdev queue.
|
||||
.It Ar off
|
||||
Never adds I/O requests to the vdev queue.
|
||||
This is not recommended for vdevs backed by spinning disks as it could
|
||||
result in starvation.
|
||||
.El
|
||||
.El
|
||||
.Ss User Properties
|
||||
In addition to the standard native properties, ZFS supports arbitrary user
|
||||
|
||||
@ -968,6 +968,9 @@ skip_open:
|
||||
else
|
||||
vd->vdev_nonrot = B_FALSE;
|
||||
|
||||
/* Is backed by a block device. */
|
||||
vd->vdev_is_blkdev = B_TRUE;
|
||||
|
||||
/* Set when device reports it supports TRIM. */
|
||||
error = g_getattr("GEOM::candelete", cp, &has_trim);
|
||||
vd->vdev_has_trim = (error == 0 && has_trim);
|
||||
|
||||
@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
|
||||
/* Inform the ZIO pipeline that we are non-rotational */
|
||||
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
|
||||
|
||||
/* Is backed by a block device. */
|
||||
v->vdev_is_blkdev = B_TRUE;
|
||||
|
||||
/* Physical volume size in bytes for the partition */
|
||||
*psize = bdev_capacity(bdev);
|
||||
|
||||
|
||||
@ -326,6 +326,13 @@ vdev_prop_init(void)
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static const zprop_index_t vdevschedulertype_table[] = {
|
||||
{ "auto", VDEV_SCHEDULER_AUTO },
|
||||
{ "on", VDEV_SCHEDULER_ON },
|
||||
{ "off", VDEV_SCHEDULER_OFF },
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
struct zfs_mod_supported_features *sfeatures =
|
||||
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
|
||||
|
||||
@ -484,6 +491,10 @@ vdev_prop_init(void)
|
||||
zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events",
|
||||
B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off",
|
||||
"SLOW_IO_EVENTS", boolean_table, sfeatures);
|
||||
zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler",
|
||||
VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
|
||||
"auto | on | off", "IO_SCHEDULER",
|
||||
vdevschedulertype_table, sfeatures);
|
||||
|
||||
/* hidden properties */
|
||||
zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING,
|
||||
|
||||
@ -767,6 +767,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
|
||||
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
|
||||
|
||||
vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER);
|
||||
|
||||
list_link_init(&vd->vdev_config_dirty_node);
|
||||
list_link_init(&vd->vdev_state_dirty_node);
|
||||
list_link_init(&vd->vdev_initialize_node);
|
||||
@ -3972,6 +3974,12 @@ vdev_load(vdev_t *vd)
|
||||
if (error && error != ENOENT)
|
||||
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
|
||||
"failed [error=%d]", (u_longlong_t)zapobj, error);
|
||||
|
||||
error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER,
|
||||
&vd->vdev_scheduler);
|
||||
if (error && error != ENOENT)
|
||||
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
|
||||
"failed [error=%d]", (u_longlong_t)zapobj, error);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -6259,6 +6267,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
|
||||
}
|
||||
vd->vdev_slow_io_t = intval;
|
||||
break;
|
||||
case VDEV_PROP_SCHEDULER:
|
||||
if (nvpair_value_uint64(elem, &intval) != 0) {
|
||||
error = EINVAL;
|
||||
break;
|
||||
}
|
||||
vd->vdev_scheduler = intval;
|
||||
break;
|
||||
default:
|
||||
/* Most processing is done in vdev_props_set_sync */
|
||||
break;
|
||||
@ -6664,6 +6679,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
|
||||
case VDEV_PROP_IO_T:
|
||||
case VDEV_PROP_SLOW_IO_N:
|
||||
case VDEV_PROP_SLOW_IO_T:
|
||||
case VDEV_PROP_SCHEDULER:
|
||||
err = vdev_prop_get_int(vd, prop, &intval);
|
||||
if (err && err != ENOENT)
|
||||
break;
|
||||
|
||||
@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
|
||||
*/
|
||||
vd->vdev_nonrot = B_TRUE;
|
||||
|
||||
/* Is not backed by a block device. */
|
||||
vd->vdev_is_blkdev = B_FALSE;
|
||||
|
||||
/*
|
||||
* Allow TRIM on file based vdevs. This may not always be supported,
|
||||
* since it depends on your kernel version and underlying filesystem
|
||||
|
||||
@ -879,6 +879,38 @@ again:
|
||||
return (zio);
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
vdev_should_queue_io(zio_t *zio)
|
||||
{
|
||||
vdev_t *vd = zio->io_vd;
|
||||
boolean_t should_queue = B_TRUE;
|
||||
|
||||
/*
|
||||
* Add zio with ZIO_FLAG_NODATA to queue as bypass code
|
||||
* currently does not handle certain cases (gang abd, raidz
|
||||
* write aggregation).
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_NODATA)
|
||||
return (B_TRUE);
|
||||
|
||||
switch (vd->vdev_scheduler) {
|
||||
case VDEV_SCHEDULER_AUTO:
|
||||
if (vd->vdev_nonrot && vd->vdev_is_blkdev)
|
||||
should_queue = B_FALSE;
|
||||
break;
|
||||
case VDEV_SCHEDULER_ON:
|
||||
should_queue = B_TRUE;
|
||||
break;
|
||||
case VDEV_SCHEDULER_OFF:
|
||||
should_queue = B_FALSE;
|
||||
break;
|
||||
default:
|
||||
should_queue = B_TRUE;
|
||||
break;
|
||||
}
|
||||
return (should_queue);
|
||||
}
|
||||
|
||||
zio_t *
|
||||
vdev_queue_io(zio_t *zio)
|
||||
{
|
||||
@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio)
|
||||
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
|
||||
zio->io_timestamp = gethrtime();
|
||||
|
||||
if (!vdev_should_queue_io(zio)) {
|
||||
zio->io_queue_state = ZIO_QS_NONE;
|
||||
return (zio);
|
||||
}
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
vdev_queue_io_add(vq, zio);
|
||||
nio = vdev_queue_io_to_issue(vq);
|
||||
@ -954,6 +991,9 @@ vdev_queue_io_done(zio_t *zio)
|
||||
vq->vq_io_complete_ts = now;
|
||||
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
|
||||
|
||||
if (zio->io_queue_state == ZIO_QS_NONE)
|
||||
return;
|
||||
|
||||
mutex_enter(&vq->vq_lock);
|
||||
vdev_queue_pending_remove(vq, zio);
|
||||
|
||||
|
||||
@ -564,7 +564,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
|
||||
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
|
||||
'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
|
||||
'user_property_001_pos', 'user_property_002_neg',
|
||||
'zpool_set_clear_userprop']
|
||||
'zpool_set_clear_userprop','vdev_set_scheduler']
|
||||
tags = ['functional', 'cli_root', 'zpool_set']
|
||||
|
||||
[tests/functional/cli_root/zpool_split]
|
||||
|
||||
@ -1274,6 +1274,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/cli_root/zpool_set/setup.ksh \
|
||||
functional/cli_root/zpool/setup.ksh \
|
||||
functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
|
||||
functional/cli_root/zpool_set/vdev_set_scheduler.ksh \
|
||||
functional/cli_root/zpool_set/zpool_set_common.kshlib \
|
||||
functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
|
||||
functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
|
||||
|
||||
@ -77,4 +77,5 @@ typeset -a properties=(
|
||||
trim_support
|
||||
trim_errors
|
||||
slow_ios
|
||||
scheduler
|
||||
)
|
||||
|
||||
93
tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh
Executable file
93
tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh
Executable file
@ -0,0 +1,93 @@
|
||||
#!/bin/ksh -p
|
||||
# SPDX-License-Identifier: CDDL-1.0
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2025 by Triad National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Setting vdev scheduler property while reading from vdev should not cause panic.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a zpool
|
||||
# 2. Write a file to the pool, while also setting the scheduler property.
|
||||
# 3. Start reading from file, while also setting the scheduler property.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
command -v fio > /dev/null || log_unsupported "fio missing"
|
||||
|
||||
#
# Cycle the scheduler property of $FILEDEV in $TESTPOOL1 through every
# supported value (auto, on, off), pausing briefly before each change.
#
# Returns non-zero as soon as any 'zpool set' fails, so that a caller
# wrapping this function in log_must notices a failure for any value and
# not only for the last one.  The loop variable is local so the caller's
# own loop variable is left untouched.
#
function set_scheduler
{
	typeset mode

	for mode in auto on off ; do
		sleep 0.1
		zpool set scheduler=$mode $TESTPOOL1 $FILEDEV || return 1
	done

	return 0
}
|
||||
|
||||
#
# Exit handler: tear down the test pool first, then remove the file that
# backed its vdev.
#
function cleanup
{
	destroy_pool $TESTPOOL1
	log_must rm -f $FILEDEV
}
|
||||
|
||||
log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic"
|
||||
log_onexit cleanup
|
||||
|
||||
# 1. Create a pool
|
||||
|
||||
FILEDEV="$TEST_BASE_DIR/filedev.$$"
|
||||
log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV
|
||||
create_pool $TESTPOOL1 $FILEDEV
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL1)
|
||||
|
||||
# 2. Write a file to the pool, while also setting the scheduler property.
|
||||
|
||||
log_must eval "fio --filename=$mntpnt/foobar --name=write-file \
|
||||
--rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
|
||||
--ioengine=sync --time_based --runtime=2 &"
|
||||
|
||||
ITERATIONS=4
|
||||
|
||||
for i in $(seq $ITERATIONS); do
|
||||
log_must set_scheduler
|
||||
done;
|
||||
wait
|
||||
|
||||
# 3. Start reading from file, while also setting the scheduler property.
|
||||
|
||||
log_must eval "fio --filename=$mntpnt/foobar --name=read-file \
|
||||
--rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
|
||||
--ioengine=sync --time_based --runtime=2 &"
|
||||
|
||||
for i in $(seq $ITERATIONS); do
|
||||
log_must set_scheduler
|
||||
done;
|
||||
wait
|
||||
|
||||
log_pass "Setting vdev scheduler property while reading from vdev does not cause panic"
|
||||
Loading…
Reference in New Issue
Block a user