Add vdev property to disable vdev scheduler

Added vdev property to disable the vdev scheduler.
The intention behind this property is to improve IOPS
performance when using o_direct.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Signed-off-by: MigeljanImeri <ImeriMigel@gmail.com>
Closes #17358
This commit is contained in:
MigeljanImeri 2026-02-23 12:34:33 -05:00 committed by GitHub
parent d2f5cb3a50
commit 4975430cf5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 206 additions and 2 deletions

View File

@ -389,9 +389,22 @@ typedef enum {
VDEV_PROP_SIT_OUT,
VDEV_PROP_AUTOSIT,
VDEV_PROP_SLOW_IO_EVENTS,
VDEV_PROP_SCHEDULER,
VDEV_NUM_PROPS
} vdev_prop_t;
/*
 * Scheduling behaviors for the vdev "scheduler" property, which controls
 * whether I/Os pass through the vdev queue.
 * VDEV_SCHEDULER_AUTO = Let ZFS decide - currently I/Os are queued unless
 *     the vdev is a non-rotational block device (file-backed vdevs always
 *     queue).
 * VDEV_SCHEDULER_ON = Always queue.
 * VDEV_SCHEDULER_OFF = Never queue.
 */
typedef enum {
	VDEV_SCHEDULER_AUTO,	/* queue based on device characteristics */
	VDEV_SCHEDULER_ON,	/* always add I/O to the vdev queue */
	VDEV_SCHEDULER_OFF	/* bypass the vdev queue entirely */
} vdev_scheduler_type_t;
/*
* Dataset property functions shared between libzfs and kernel.
*/

View File

@ -425,6 +425,7 @@ struct vdev {
boolean_t vdev_resilver_deferred; /* resilver deferred */
boolean_t vdev_kobj_flag; /* kobj event record */
boolean_t vdev_attaching; /* vdev attach ashift handling */
boolean_t vdev_is_blkdev; /* vdev is backed by block device */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
@ -473,6 +474,7 @@ struct vdev {
boolean_t vdev_slow_io_events;
uint64_t vdev_slow_io_n;
uint64_t vdev_slow_io_t;
uint64_t vdev_scheduler; /* control how I/Os are submitted */
};
#define VDEV_PAD_SIZE (8 << 10)

View File

@ -6255,7 +6255,8 @@
<enumerator name='VDEV_PROP_SIT_OUT' value='52'/>
<enumerator name='VDEV_PROP_AUTOSIT' value='53'/>
<enumerator name='VDEV_PROP_SLOW_IO_EVENTS' value='54'/>
<enumerator name='VDEV_NUM_PROPS' value='55'/>
<enumerator name='VDEV_PROP_SCHEDULER' value='55'/>
<enumerator name='VDEV_NUM_PROPS' value='56'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>

View File

@ -194,6 +194,23 @@ If this device should perform new allocations, used to disable a device
when it is scheduled for later removal.
See
.Xr zpool-remove 8 .
.It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off
Controls how I/O requests are added to the vdev queue when reading or
writing to this vdev.
This property can be set on leaf vdevs.
The value of this property does not persist across vdev replacement.
.Bl -tag -compact -width "auto"
.It Ar auto
Let ZFS choose which scheduler it thinks will be best.
Currently, the scheduler will queue I/O if the vdev is backed by a rotational
block device or file, and not queue otherwise.
.It Ar on
Always adds I/O requests to the vdev queue.
.It Ar off
Never adds I/O requests to the vdev queue.
This is not recommended for vdevs backed by spinning disks as it could
result in starvation.
.El
.El
.Ss User Properties
In addition to the standard native properties, ZFS supports arbitrary user

View File

@ -968,6 +968,9 @@ skip_open:
else
vd->vdev_nonrot = B_FALSE;
/* Is backed by a block device. */
vd->vdev_is_blkdev = B_TRUE;
/* Set when device reports it supports TRIM. */
error = g_getattr("GEOM::candelete", cp, &has_trim);
vd->vdev_has_trim = (error == 0 && has_trim);

View File

@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
/* Inform the ZIO pipeline that we are non-rotational */
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
/* Is backed by a block device. */
v->vdev_is_blkdev = B_TRUE;
/* Physical volume size in bytes for the partition */
*psize = bdev_capacity(bdev);

View File

@ -326,6 +326,13 @@ vdev_prop_init(void)
{ NULL }
};
static const zprop_index_t vdevschedulertype_table[] = {
{ "auto", VDEV_SCHEDULER_AUTO },
{ "on", VDEV_SCHEDULER_ON },
{ "off", VDEV_SCHEDULER_OFF },
{ NULL }
};
struct zfs_mod_supported_features *sfeatures =
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
@ -484,6 +491,10 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events",
B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off",
"SLOW_IO_EVENTS", boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler",
VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
"auto | on | off", "IO_SCHEDULER",
vdevschedulertype_table, sfeatures);
/* hidden properties */
zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING,

View File

@ -767,6 +767,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER);
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
@ -3972,6 +3974,12 @@ vdev_load(vdev_t *vd)
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER,
&vd->vdev_scheduler);
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
}
/*
@ -6259,6 +6267,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_slow_io_t = intval;
break;
case VDEV_PROP_SCHEDULER:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
vd->vdev_scheduler = intval;
break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
@ -6664,6 +6679,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
case VDEV_PROP_SCHEDULER:
err = vdev_prop_get_int(vd, prop, &intval);
if (err && err != ENOENT)
break;

View File

@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
*/
vd->vdev_nonrot = B_TRUE;
/* Is not backed by a block device. */
vd->vdev_is_blkdev = B_FALSE;
/*
* Allow TRIM on file based vdevs. This may not always be supported,
* since it depends on your kernel version and underlying filesystem

View File

@ -879,6 +879,38 @@ again:
return (zio);
}
/*
 * Decide whether a zio should be added to the vdev queue, based on the
 * vdev's "scheduler" property.  Returns B_TRUE when the zio must be
 * queued, B_FALSE when it may bypass the queue.
 */
static boolean_t
vdev_should_queue_io(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * Add zio with ZIO_FLAG_NODATA to queue as bypass code
	 * currently does not handle certain cases (gang abd, raidz
	 * write aggregation).
	 */
	if (zio->io_flags & ZIO_FLAG_NODATA)
		return (B_TRUE);

	switch (vd->vdev_scheduler) {
	case VDEV_SCHEDULER_OFF:
		/* Never queue. */
		return (B_FALSE);
	case VDEV_SCHEDULER_AUTO:
		/* Bypass only non-rotational block devices. */
		return (!(vd->vdev_nonrot && vd->vdev_is_blkdev));
	case VDEV_SCHEDULER_ON:
	default:
		/* Always queue; unknown values fall back to queueing. */
		return (B_TRUE);
	}
}
zio_t *
vdev_queue_io(zio_t *zio)
{
@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();
if (!vdev_should_queue_io(zio)) {
zio->io_queue_state = ZIO_QS_NONE;
return (zio);
}
mutex_enter(&vq->vq_lock);
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
@ -954,6 +991,9 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = now;
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
if (zio->io_queue_state == ZIO_QS_NONE)
return;
mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);

View File

@ -564,7 +564,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
'user_property_001_pos', 'user_property_002_neg',
'zpool_set_clear_userprop']
'zpool_set_clear_userprop', 'vdev_set_scheduler']
tags = ['functional', 'cli_root', 'zpool_set']
[tests/functional/cli_root/zpool_split]

View File

@ -1274,6 +1274,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_set/setup.ksh \
functional/cli_root/zpool/setup.ksh \
functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
functional/cli_root/zpool_set/vdev_set_scheduler.ksh \
functional/cli_root/zpool_set/zpool_set_common.kshlib \
functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
functional/cli_root/zpool_set/zpool_set_002_neg.ksh \

View File

@ -77,4 +77,5 @@ typeset -a properties=(
trim_support
trim_errors
slow_ios
scheduler
)

View File

@ -0,0 +1,93 @@
#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2025 by Triad National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Setting vdev scheduler property while reading from vdev should not cause panic.
#
# STRATEGY:
# 1. Create a zpool
# 2. Write a file to the pool.
# 3. Start reading from file, while also setting the scheduler property.
#
verify_runnable "global"
command -v fio > /dev/null || log_unsupported "fio missing"
# Walk the scheduler property through each supported value, pausing
# briefly so the changes land while I/O is in flight.
function set_scheduler
{
	for sched in auto on off ; do
		sleep 0.1
		zpool set scheduler=$sched $TESTPOOL1 $FILEDEV
	done
}
# Tear down the test pool and remove the file-backed vdev.
function cleanup
{
	destroy_pool $TESTPOOL1
	# Quote the path: $TEST_BASE_DIR may contain spaces.
	log_must rm -f "$FILEDEV"
}
log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic"

log_onexit cleanup

# 1. Create a pool backed by a sparse file vdev.
FILEDEV="$TEST_BASE_DIR/filedev.$$"
log_must truncate -s $((MINVDEVSIZE * 2)) "$FILEDEV"
create_pool $TESTPOOL1 $FILEDEV

mntpnt=$(get_prop mountpoint $TESTPOOL1)

# 2. Write a file to the pool, while also toggling the scheduler property.
log_must eval "fio --filename=$mntpnt/foobar --name=write-file \
    --rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
    --ioengine=sync --time_based --runtime=2 &"

ITERATIONS=4
for i in $(seq $ITERATIONS); do
	log_must set_scheduler
done
wait

# 3. Start reading from the file, while also toggling the scheduler property.
log_must eval "fio --filename=$mntpnt/foobar --name=read-file \
    --rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
    --ioengine=sync --time_based --runtime=2 &"

for i in $(seq $ITERATIONS); do
	log_must set_scheduler
done
wait

log_pass "Setting vdev scheduler property while reading from vdev does not cause panic"