From 4975430cf594859947735d42c8f45ad534ae27ad Mon Sep 17 00:00:00 2001 From: MigeljanImeri <78048439+MigeljanImeri@users.noreply.github.com> Date: Mon, 23 Feb 2026 12:34:33 -0500 Subject: [PATCH] Add vdev property to disable vdev scheduler Added vdev property to disable the vdev scheduler. The intention behind this property is to improve IOPS performance when using o_direct. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: MigeljanImeri Closes #17358 --- include/sys/fs/zfs.h | 13 +++ include/sys/vdev_impl.h | 2 + lib/libzfs/libzfs.abi | 3 +- man/man7/vdevprops.7 | 17 ++++ module/os/freebsd/zfs/vdev_geom.c | 3 + module/os/linux/zfs/vdev_disk.c | 3 + module/zcommon/zpool_prop.c | 11 +++ module/zfs/vdev.c | 16 ++++ module/zfs/vdev_file.c | 3 + module/zfs/vdev_queue.c | 40 ++++++++ tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zpool_get/vdev_get.cfg | 1 + .../cli_root/zpool_set/vdev_set_scheduler.ksh | 93 +++++++++++++++++++ 14 files changed, 206 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index ca929ed51..ab9b4e746 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -389,9 +389,22 @@ typedef enum { VDEV_PROP_SIT_OUT, VDEV_PROP_AUTOSIT, VDEV_PROP_SLOW_IO_EVENTS, + VDEV_PROP_SCHEDULER, VDEV_NUM_PROPS } vdev_prop_t; +/* + * Different scheduling behaviors for vdev scheduler property. + * VDEV_SCHEDULER_AUTO = Let ZFS decide - currently queue on HDDs and files. + * VDEV_SCHEDULER_ON = Always queue. + * VDEV_SCHEDULER_OFF = Never queue. + */ +typedef enum { + VDEV_SCHEDULER_AUTO, + VDEV_SCHEDULER_ON, + VDEV_SCHEDULER_OFF +} vdev_scheduler_type_t; + /* * Dataset property functions shared between libzfs and kernel. 
*/ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index afaa40134..634594aca 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -425,6 +425,7 @@ struct vdev { boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ boolean_t vdev_attaching; /* vdev attach ashift handling */ + boolean_t vdev_is_blkdev; /* vdev is backed by block device */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ @@ -473,6 +474,7 @@ struct vdev { boolean_t vdev_slow_io_events; uint64_t vdev_slow_io_n; uint64_t vdev_slow_io_t; + uint64_t vdev_scheduler; /* control how I/Os are submitted */ }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index a32f2231d..b51984f40 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -6255,7 +6255,8 @@ - + + diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index b54abcd3e..3b65a52ae 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -194,6 +194,23 @@ If this device should perform new allocations, used to disable a device when it is scheduled for later removal. See .Xr zpool-remove 8 . +.It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off +Controls how I/O requests are added to the vdev queue when reading or +writing to this vdev. +This property can be set on leaf vdevs. +The value of this property does not persist across vdev replacement. +.Bl -tag -compact -width "auto" +.It Ar auto +Let ZFS choose which scheduler it thinks will be best. +Currently, the scheduler will queue I/O if the vdev is backed by a rotational +block device or file, and not queue otherwise. +.It Ar on +Always adds I/O requests to the vdev queue. +.It Ar off +Never adds I/O requests to the vdev queue. +This is not recommended for vdevs backed by spinning disks as it could +result in starvation. 
+.El .El .Ss User Properties In addition to the standard native properties, ZFS supports arbitrary user diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index bbd1dafc6..be30c58cf 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -968,6 +968,9 @@ skip_open: else vd->vdev_nonrot = B_FALSE; + /* Is backed by a block device. */ + vd->vdev_is_blkdev = B_TRUE; + /* Set when device reports it supports TRIM. */ error = g_getattr("GEOM::candelete", cp, &has_trim); vd->vdev_has_trim = (error == 0 && has_trim); diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 1bd3500e9..4a9dbaa64 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); + /* Is backed by a block device. 
*/ + v->vdev_is_blkdev = B_TRUE; + /* Physical volume size in bytes for the partition */ *psize = bdev_capacity(bdev); diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 4826237b2..1be5f9d30 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -326,6 +326,13 @@ vdev_prop_init(void) { NULL } }; + static const zprop_index_t vdevschedulertype_table[] = { + { "auto", VDEV_SCHEDULER_AUTO }, + { "on", VDEV_SCHEDULER_ON }, + { "off", VDEV_SCHEDULER_OFF }, + { NULL } + }; + struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES); @@ -484,6 +491,10 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events", B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SLOW_IO_EVENTS", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler", + VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV, + "auto | on | off", "IO_SCHEDULER", + vdevschedulertype_table, sfeatures); /* hidden properties */ zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index a8ae2d130..3480b884e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -767,6 +767,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); + vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER); + list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); @@ -3972,6 +3974,12 @@ vdev_load(vdev_t *vd) if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER, + &vd->vdev_scheduler); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: 
zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); } /* @@ -6259,6 +6267,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_slow_io_t = intval; break; + case VDEV_PROP_SCHEDULER: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_scheduler = intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6664,6 +6679,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: + case VDEV_PROP_SCHEDULER: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 20b4db65e..da8fc3637 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, */ vd->vdev_nonrot = B_TRUE; + /* Is not backed by a block device. */ + vd->vdev_is_blkdev = B_FALSE; + /* * Allow TRIM on file based vdevs. This may not always be supported, * since it depends on your kernel version and underlying filesystem diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e69e55989..c03d0d2e1 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -879,6 +879,38 @@ again: return (zio); } +static boolean_t +vdev_should_queue_io(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + boolean_t should_queue = B_TRUE; + + /* + * Add zio with ZIO_FLAG_NODATA to queue as bypass code + * currently does not handle certain cases (gang abd, raidz + * write aggregation). 
+ */ + if (zio->io_flags & ZIO_FLAG_NODATA) + return (B_TRUE); + + switch (vd->vdev_scheduler) { + case VDEV_SCHEDULER_AUTO: + if (vd->vdev_nonrot && vd->vdev_is_blkdev) + should_queue = B_FALSE; + break; + case VDEV_SCHEDULER_ON: + should_queue = B_TRUE; + break; + case VDEV_SCHEDULER_OFF: + should_queue = B_FALSE; + break; + default: + should_queue = B_TRUE; + break; + } + return (should_queue); +} + zio_t * vdev_queue_io(zio_t *zio) { @@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio) zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); + if (!vdev_should_queue_io(zio)) { + zio->io_queue_state = ZIO_QS_NONE; + return (zio); + } + mutex_enter(&vq->vq_lock); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); @@ -954,6 +991,9 @@ vdev_queue_io_done(zio_t *zio) vq->vq_io_complete_ts = now; vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; + if (zio->io_queue_state == ZIO_QS_NONE) + return; + mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 19df29ec3..8394bc4bc 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -564,7 +564,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', 'user_property_001_pos', 'user_property_002_neg', - 'zpool_set_clear_userprop'] + 'zpool_set_clear_userprop','vdev_set_scheduler'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 1a5cf6eba..e3fcce984 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1274,6 +1274,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ + 
functional/cli_root/zpool_set/vdev_set_scheduler.ksh \ functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index 6d9aa2868..f59104e19 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -77,4 +77,5 @@ typeset -a properties=( trim_support trim_errors slow_ios + scheduler ) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh new file mode 100755 index 000000000..e8b5e97d7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Triad National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Setting vdev scheduler property while reading from vdev should not cause panic. +# +# STRATEGY: +# 1. Create a zpool +# 2. Write a file to the pool. +# 3. Start reading from file, while also setting the scheduler property. +# + +verify_runnable "global" + +command -v fio > /dev/null || log_unsupported "fio missing" + +function set_scheduler +{ + for i in auto on off ; do + sleep 0.1 + zpool set scheduler=$i $TESTPOOL1 $FILEDEV + done +} + +function cleanup +{ + destroy_pool $TESTPOOL1 + log_must rm -f $FILEDEV +} + +log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic" +log_onexit cleanup + +# 1. Create a pool + +FILEDEV="$TEST_BASE_DIR/filedev.$$" +log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV +create_pool $TESTPOOL1 $FILEDEV + +mntpnt=$(get_prop mountpoint $TESTPOOL1) + +# 2. Write a file to the pool, while also setting the scheduler property. + +log_must eval "fio --filename=$mntpnt/foobar --name=write-file \ + --rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \ + --ioengine=sync --time_based --runtime=2 &" + +ITERATIONS=4 + +for i in $(seq $ITERATIONS); do + log_must set_scheduler +done; +wait + +# 3. Start reading from file, while also setting the scheduler property. + +log_must eval "fio --filename=$mntpnt/foobar --name=read-file \ + --rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \ + --ioengine=sync --time_based --runtime=2 &" + +for i in $(seq $ITERATIONS); do + log_must set_scheduler +done; +wait + +log_pass "Setting vdev scheduler property while reading from vdev does not cause panic"