diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index ca929ed51..ab9b4e746 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -389,9 +389,22 @@ typedef enum {
VDEV_PROP_SIT_OUT,
VDEV_PROP_AUTOSIT,
VDEV_PROP_SLOW_IO_EVENTS,
+ VDEV_PROP_SCHEDULER,
VDEV_NUM_PROPS
} vdev_prop_t;
+/*
+ * Different scheduling behaviors for vdev scheduler property.
+ * VDEV_SCHEDULER_AUTO = Let ZFS decide - currently queue for rotational devices and files.
+ * VDEV_SCHEDULER_ON = Always queue.
+ * VDEV_SCHEDULER_OFF = Never queue.
+ */
+typedef enum {
+ VDEV_SCHEDULER_AUTO,
+ VDEV_SCHEDULER_ON,
+ VDEV_SCHEDULER_OFF
+} vdev_scheduler_type_t;
+
/*
* Dataset property functions shared between libzfs and kernel.
*/
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index afaa40134..634594aca 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -425,6 +425,7 @@ struct vdev {
boolean_t vdev_resilver_deferred; /* resilver deferred */
boolean_t vdev_kobj_flag; /* kobj event record */
boolean_t vdev_attaching; /* vdev attach ashift handling */
+ boolean_t vdev_is_blkdev; /* vdev is backed by block device */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
@@ -473,6 +474,7 @@ struct vdev {
boolean_t vdev_slow_io_events;
uint64_t vdev_slow_io_n;
uint64_t vdev_slow_io_t;
+ uint64_t vdev_scheduler; /* control how I/Os are submitted */
};
#define VDEV_PAD_SIZE (8 << 10)
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index a32f2231d..b51984f40 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -6255,7 +6255,8 @@
-
+
+
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index b54abcd3e..3b65a52ae 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -194,6 +194,23 @@ If this device should perform new allocations, used to disable a device
when it is scheduled for later removal.
See
.Xr zpool-remove 8 .
+.It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy on Ns | Ns Sy off
+Controls how I/O requests are added to the vdev queue when reading or
+writing to this vdev.
+This property can be set on leaf vdevs.
+The value of this property does not persist across vdev replacement.
+.Bl -tag -compact -width "auto"
+.It Ar auto
+Let ZFS choose which scheduler it thinks will be best.
+Currently, the scheduler will queue I/O if the vdev is backed by a rotational
+block device or file, and not queue otherwise.
+.It Ar on
+Always adds I/O requests to the vdev queue.
+.It Ar off
+Never adds I/O requests to the vdev queue.
+This is not recommended for vdevs backed by spinning disks as it could
+result in starvation.
+.El
.El
.Ss User Properties
In addition to the standard native properties, ZFS supports arbitrary user
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index bbd1dafc6..be30c58cf 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -968,6 +968,9 @@ skip_open:
else
vd->vdev_nonrot = B_FALSE;
+ /* Is backed by a block device. */
+ vd->vdev_is_blkdev = B_TRUE;
+
/* Set when device reports it supports TRIM. */
error = g_getattr("GEOM::candelete", cp, &has_trim);
vd->vdev_has_trim = (error == 0 && has_trim);
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 1bd3500e9..4a9dbaa64 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
/* Inform the ZIO pipeline that we are non-rotational */
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
+ /* Is backed by a block device. */
+ v->vdev_is_blkdev = B_TRUE;
+
/* Physical volume size in bytes for the partition */
*psize = bdev_capacity(bdev);
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index 4826237b2..1be5f9d30 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -326,6 +326,13 @@ vdev_prop_init(void)
{ NULL }
};
+ static const zprop_index_t vdevschedulertype_table[] = {
+ { "auto", VDEV_SCHEDULER_AUTO },
+ { "on", VDEV_SCHEDULER_ON },
+ { "off", VDEV_SCHEDULER_OFF },
+ { NULL }
+ };
+
struct zfs_mod_supported_features *sfeatures =
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
@@ -484,6 +491,10 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_SLOW_IO_EVENTS, "slow_io_events",
B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off",
"SLOW_IO_EVENTS", boolean_table, sfeatures);
+ zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler",
+ VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
+ "auto | on | off", "IO_SCHEDULER",
+ vdevschedulertype_table, sfeatures);
/* hidden properties */
zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING,
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index a8ae2d130..3480b884e 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -767,6 +767,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
+ vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER);
+
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
@@ -3972,6 +3974,12 @@ vdev_load(vdev_t *vd)
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER,
+ &vd->vdev_scheduler);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
}
/*
@@ -6259,6 +6267,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_slow_io_t = intval;
break;
+ case VDEV_PROP_SCHEDULER:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_scheduler = intval;
+ break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
@@ -6664,6 +6679,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
+ case VDEV_PROP_SCHEDULER:
err = vdev_prop_get_int(vd, prop, &intval);
if (err && err != ENOENT)
break;
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index 20b4db65e..da8fc3637 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
*/
vd->vdev_nonrot = B_TRUE;
+ /* Is not backed by a block device. */
+ vd->vdev_is_blkdev = B_FALSE;
+
/*
* Allow TRIM on file based vdevs. This may not always be supported,
* since it depends on your kernel version and underlying filesystem
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index e69e55989..c03d0d2e1 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -879,6 +879,38 @@ again:
return (zio);
}
+static boolean_t
+vdev_should_queue_io(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ boolean_t should_queue = B_TRUE;
+
+ /*
+ * Add zio with ZIO_FLAG_NODATA to queue as bypass code
+ * currently does not handle certain cases (gang abd, raidz
+ * write aggregation).
+ */
+ if (zio->io_flags & ZIO_FLAG_NODATA)
+ return (B_TRUE);
+
+ switch (vd->vdev_scheduler) {
+ case VDEV_SCHEDULER_AUTO:
+ if (vd->vdev_nonrot && vd->vdev_is_blkdev)
+ should_queue = B_FALSE;
+ break;
+ case VDEV_SCHEDULER_ON:
+ should_queue = B_TRUE;
+ break;
+ case VDEV_SCHEDULER_OFF:
+ should_queue = B_FALSE;
+ break;
+ default:
+ should_queue = B_TRUE;
+ break;
+ }
+ return (should_queue);
+}
+
zio_t *
vdev_queue_io(zio_t *zio)
{
@@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();
+ if (!vdev_should_queue_io(zio)) {
+ zio->io_queue_state = ZIO_QS_NONE;
+ return (zio);
+ }
+
mutex_enter(&vq->vq_lock);
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
@@ -954,6 +991,9 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = now;
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
+ if (zio->io_queue_state == ZIO_QS_NONE)
+ return;
+
mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 19df29ec3..8394bc4bc 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -564,7 +564,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
'user_property_001_pos', 'user_property_002_neg',
- 'zpool_set_clear_userprop']
+ 'zpool_set_clear_userprop', 'vdev_set_scheduler']
tags = ['functional', 'cli_root', 'zpool_set']
[tests/functional/cli_root/zpool_split]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 1a5cf6eba..e3fcce984 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1274,6 +1274,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_set/setup.ksh \
functional/cli_root/zpool/setup.ksh \
functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
+ functional/cli_root/zpool_set/vdev_set_scheduler.ksh \
functional/cli_root/zpool_set/zpool_set_common.kshlib \
functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
index 6d9aa2868..f59104e19 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
@@ -77,4 +77,5 @@ typeset -a properties=(
trim_support
trim_errors
slow_ios
+ scheduler
)
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh
new file mode 100755
index 000000000..e8b5e97d7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_scheduler.ksh
@@ -0,0 +1,93 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Triad National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Setting vdev scheduler property while reading from vdev should not cause panic.
+#
+# STRATEGY:
+# 1. Create a zpool
+# 2. Write a file to the pool, while also setting the scheduler property.
+# 3. Start reading from file, while also setting the scheduler property.
+#
+
+verify_runnable "global"
+
+command -v fio > /dev/null || log_unsupported "fio missing"
+
+function set_scheduler
+{
+ for i in auto on off ; do
+ sleep 0.1
+ zpool set scheduler=$i $TESTPOOL1 $FILEDEV
+ done
+}
+
+function cleanup
+{
+ destroy_pool $TESTPOOL1
+ log_must rm -f $FILEDEV
+}
+
+log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic"
+log_onexit cleanup
+
+# 1. Create a pool
+
+FILEDEV="$TEST_BASE_DIR/filedev.$$"
+log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV
+create_pool $TESTPOOL1 $FILEDEV
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1)
+
+# 2. Write a file to the pool, while also setting the scheduler property.
+
+log_must eval "fio --filename=$mntpnt/foobar --name=write-file \
+ --rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
+ --ioengine=sync --time_based --runtime=2 &"
+
+ITERATIONS=4
+
+for i in $(seq $ITERATIONS); do
+ log_must set_scheduler
+done;
+wait
+
+# 3. Start reading from file, while also setting the scheduler property.
+
+log_must eval "fio --filename=$mntpnt/foobar --name=read-file \
+ --rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
+ --ioengine=sync --time_based --runtime=2 &"
+
+for i in $(seq $ITERATIONS); do
+ log_must set_scheduler
+done;
+wait
+
+log_pass "Setting vdev scheduler property while reading from vdev does not cause panic"