zvol: Support blk-mq for better performance

Add support for the kernel's block multiqueue (blk-mq) interface in
the zvol block driver.  blk-mq creates multiple request queues on
different CPUs rather than having a single request queue.  This can
improve zvol performance with multithreaded reads/writes.

This implementation uses the blk-mq interfaces on 4.13 or newer
kernels.  Building against older kernels will fall back to the
older BIO interfaces.

Note that you must set the `zvol_use_blk_mq` module param to
enable the blk-mq API.  It is disabled by default.

In addition, this commit lets the zvol blk-mq layer process whole
`struct request` IOs at a time, rather than breaking them down
into their individual BIOs.  This reduces dbuf lock contention
and overhead versus the legacy zvol submit_bio() codepath.

	sequential dd to one zvol, 8k volblocksize, no O_DIRECT:

	legacy submit_bio()     292MB/s write  453MB/s read
	this commit             453MB/s write  885MB/s read

It also introduces a new `zvol_blk_mq_chunks_per_thread` module
parameter.  This parameter sets how many volblocksize-sized chunks
each zvol thread processes.  It can be used to tune your zvols for
better read vs write performance (higher values favor write, lower
values favor read).

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #13148
Issue #12483
This commit is contained in:
Tony Hutter
2022-06-09 07:10:38 -07:00
committed by GitHub
parent 985c33b132
commit 6f73d02168
18 changed files with 1441 additions and 152 deletions
+2
View File
@@ -120,10 +120,12 @@ export SYSTEM_FILES_FREEBSD='chflags
showmount
swapctl
sysctl
trim
uncompress'
export SYSTEM_FILES_LINUX='attr
blkid
blkdiscard
blockdev
chattr
exportfs
+32 -13
View File
@@ -2770,20 +2770,22 @@ function is_te_enabled
svcs -H -o state labeld 2>/dev/null | grep -q "enabled"
}
# Print the number of CPUs on this system (cross-platform).
#
# Linux counts the "processor" lines in /proc/cpuinfo, FreeBSD asks
# sysctl, and every other platform counts the lines psrinfo prints.
function get_num_cpus
{
	if is_linux ; then
		grep -c '^processor' /proc/cpuinfo
		return
	fi

	if is_freebsd ; then
		sysctl -n kern.smp.cpus
		return
	fi

	psrinfo | wc -l
}
# Utility function to determine if a system has multiple cpus.
#
# Returns 0 when get_num_cpus reports more than one CPU and non-zero
# otherwise.  Prints nothing.
function is_mp
{
	# get_num_cpus already handles the per-platform differences, so no
	# $UNAME dispatch is needed here.  (A bare "sysctl -n kern.smp.cpus"
	# would also leak the CPU count to stdout on FreeBSD.)
	[[ $(get_num_cpus) -gt 1 ]]
}
function get_cpu_freq
@@ -3320,14 +3322,23 @@ function get_tunable_impl
{
typeset name="$1"
typeset module="${2:-zfs}"
typeset check_only="$3"
eval "typeset tunable=\$$name"
case "$tunable" in
UNSUPPORTED)
log_unsupported "Tunable '$name' is unsupported on $UNAME"
if [ -z "$check_only" ] ; then
log_unsupported "Tunable '$name' is unsupported on $UNAME"
else
return 1
fi
;;
"")
log_fail "Tunable '$name' must be added to tunables.cfg"
if [ -z "$check_only" ] ; then
log_fail "Tunable '$name' must be added to tunables.cfg"
else
return 1
fi
;;
*)
;;
@@ -3347,6 +3358,14 @@ function get_tunable_impl
esac
}
# Does a tunable exist?
#
# $1: Tunable name
#
# Calls get_tunable_impl with its third (check-only) argument set, so an
# unsupported or unlisted tunable produces a non-zero return instead of
# log_unsupported/log_fail.
function tunable_exists
{
	# Quote the name: an empty or whitespace-containing $1 would
	# otherwise shift "zfs" and the check-only flag into the wrong
	# positional parameters.
	get_tunable_impl "$1" "zfs" 1
}
#
# Compute MD5 digest for given file or stdin if no file given.
# Note: file path must not contain spaces
+1
View File
@@ -87,6 +87,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
VOL_MODE vol.mode zvol_volmode
VOL_RECURSIVE vol.recursive UNSUPPORTED
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
XATTR_COMPAT xattr_compat zfs_xattr_compat
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
+5
View File
@@ -1966,11 +1966,16 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \
functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \
functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \
functional/zvol/zvol_misc/zvol_misc_fua.ksh \
functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \
functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \
functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \
functional/zvol/zvol_misc/zvol_misc_trim.ksh \
functional/zvol/zvol_misc/zvol_misc_volmode.ksh \
functional/zvol/zvol_misc/zvol_misc_zil.ksh \
functional/zvol/zvol_stress/cleanup.ksh \
functional/zvol/zvol_stress/setup.ksh \
functional/zvol/zvol_stress/zvol_stress.ksh \
functional/zvol/zvol_swap/cleanup.ksh \
functional/zvol/zvol_swap/setup.ksh \
functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \
@@ -128,3 +128,14 @@ function is_zvol_dumpified
zdb -dddd $volume 2 | grep -q "dumpsize"
}
# Switch zvol blk-mq on or off, when the tunable is available.
#
# $1: 1 = enable, 0 = disable
function set_blk_mq
{
	# Not all kernels support blk-mq; nothing to do when the tunable
	# is missing.
	tunable_exists VOL_USE_BLK_MQ || return 0

	log_must set_tunable32 VOL_USE_BLK_MQ $1
}
@@ -0,0 +1,96 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/zvol/zvol_common.shlib
#
# DESCRIPTION:
# Verify that a zvol Force Unit Access (FUA) write works.
#
# STRATEGY:
# 1. dd write 5MB of data with "oflag=dsync,direct" to a zvol. Those flags
# together do a FUA write.
# 2. Verify the data is correct.
# 3. Repeat 1-2 for both the blk-mq and non-blk-mq cases.
verify_runnable "global"

# Preconditions: physical disks, and Linux dd (for the FUA oflag values).
is_physical_device $DISKS || \
	log_unsupported "This directory cannot be run on raw files."
is_linux || \
	log_unsupported "Only linux supports dd with oflag=dsync for FUA writes"

# Scratch files for the reference data and the data read back from the
# zvol, plus the path of the zvol device under test.
typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)"
typeset datafile2="$(mktemp zvol_misc_fua2.XXXXXX)"
typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
# Remove the scratch data files.
#
# do_test deletes both files at the end of a successful pass, so they are
# normally already gone when this runs; -f keeps rm from reporting that
# as an error.
function cleanup
{
	rm -f "$datafile1" "$datafile2"
}
# One FUA write/read-back/compare pass against $zvolpath.
#
# Uses the script-level datafile1, datafile2 and zvolpath variables and
# removes both data files when the pass succeeds.
function do_test
{
	# udev creates the zvol symlinks; wait until ours is there
	block_device_wait ${zvolpath}

	# 5MB of random reference data
	log_must dd if=/dev/urandom of="${datafile1}" bs=1M count=5

	# Push the reference data to the zvol as a FUA write
	# (oflag=dsync,direct)
	log_must dd if=${datafile1} of=${zvolpath} oflag=dsync,direct bs=1M count=5

	# Pull the same 5MB back out of the zvol
	log_must dd if=${zvolpath} of="${datafile2}" bs=1M count=5

	# diff returns non-zero when what we read differs from what we wrote
	log_must diff ${datafile1} ${datafile2}

	log_must rm ${datafile1} ${datafile2}
}
log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)"
log_onexit cleanup

log_must zfs set compression=off $TESTPOOL/$TESTVOL

# Run the same test twice: once with blk-mq disabled and once with it
# enabled.  The pool is exported and re-imported after each tunable
# change (presumably so the zvol is re-created under the new setting).
log_note "Testing without blk-mq"
set_blk_mq 0
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
do_test

# set_blk_mq does nothing when the tunable is unavailable, hence the
# "if available" in the note.
log_note "Testing with blk-mq (if available)"
set_blk_mq 1
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
do_test

log_pass "ZFS volume FUA works"
@@ -0,0 +1,136 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/include/math.shlib
. $STF_SUITE/tests/functional/zvol/zvol_common.shlib
#
# DESCRIPTION:
# Verify we can TRIM a zvol
#
# STRATEGY:
# 1. TRIM the entire zvol to remove data from older tests
# 2. Create a 5MB data file
# 3. Write the file to the zvol
# 4. Observe 5MB of used space on the zvol
# 5. TRIM the first 1MB and last 2MB of the 5MB block of data.
# 6. Observe 2MB of used space on the zvol
# 7. Verify the trimmed regions are zero'd on the zvol
verify_runnable "global"

# Choose the discard command for this platform.
if ! is_linux ; then
	# By default, FreeBSD 'trim' always does a dry-run.  '-f' makes
	# it perform the actual operation.
	trimcmd='trim -f'
elif blkdiscard --help | grep -q '\-\-force' ; then
	# Prior tests may leave a filesystem on the zvol; blkdiscard sees
	# it and prints a warning unless forced.  Only blkdiscard >= v2.36
	# supports --force, which is why its help text is probed first.
	trimcmd='blkdiscard --force'
else
	trimcmd='blkdiscard'
fi

is_physical_device $DISKS || \
	log_unsupported "This directory cannot be run on raw files."

# Scratch files for the reference data and the data read back from the
# zvol, plus the path of the zvol device under test.
typeset datafile1="$(mktemp zvol_misc_flags1.XXXXXX)"
typeset datafile2="$(mktemp zvol_misc_flags2.XXXXXX)"
typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
# Remove the scratch data files.
#
# do_test deletes both files at the end of a successful pass, so they are
# normally already gone when this runs; -f keeps rm from reporting that
# as an error.
function cleanup
{
	rm -f "$datafile1" "$datafile2"
}
# One write/trim/verify pass against $zvolpath.
#
# Uses the script-level trimcmd, datafile1, datafile2 and zvolpath
# variables, records the zvol's "refer" property in the globals $before
# and $after, and removes both data files when the pass succeeds.
function do_test
{
	typeset -i mib=1048576		# bytes in 1MB
	typeset -i tol=131072		# 128k of tolerance on space checks

	# udev creates the zvol symlinks; wait until ours is there
	block_device_wait $zvolpath

	# 5MB of random reference data, written to the zvol
	log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5
	log_must dd if=$datafile1 of=$zvolpath conv=fsync

	# Space used so far: should be 5MB
	before="$(get_prop refer $TESTPOOL/$TESTVOL)"
	log_must within_tolerance $before $((5 * mib)) $tol

	# Punch two holes in the 5MB of data: the first 1MB, and the 2MB
	# that start at offset 3MB.
	log_must $trimcmd -l $((1 * mib)) $zvolpath
	log_must $trimcmd -o $((3 * mib)) -l $((2 * mib)) $zvolpath

	sync_pool

	# 3MB were trimmed, so 2MB of data should remain
	after="$(get_prop refer $TESTPOOL/$TESTVOL)"
	log_must within_tolerance $after $((2 * mib)) $tol

	# Zero the same two ranges in the reference file
	log_must dd if=/dev/zero of="$datafile1" bs=1M count=1 conv=notrunc
	log_must dd if=/dev/zero of="$datafile1" bs=1M count=2 seek=3 conv=notrunc

	# Read the zvol back; diff returns non-zero when it differs from
	# the reference file.
	log_must dd if=$zvolpath of="$datafile2" bs=1M count=5
	log_must diff $datafile1 $datafile2

	log_must rm $datafile1 $datafile2
}
log_assert "Verify that a ZFS volume can be TRIMed"
log_onexit cleanup

log_must zfs set compression=off $TESTPOOL/$TESTVOL

# Remove old data from previous tests
log_must $trimcmd $zvolpath

# Same test twice: first with blk-mq enabled, then with it disabled.
# The pool is exported and re-imported after each tunable change.
for blk_mq in 1 0 ; do
	set_blk_mq $blk_mq
	log_must zpool export $TESTPOOL
	log_must zpool import $TESTPOOL
	do_test
done

log_pass "ZFS volumes can be trimmed"
@@ -0,0 +1,36 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
# NOTE(review): verify_runnable comes from libtest.shlib; it presumably
# restricts where this script may run ("global") — confirm there.
verify_runnable "global"
# Tear-down counterpart of setup.ksh's default_setup.  NOTE(review):
# presumably destroys the default test pool — confirm in libtest.shlib.
default_cleanup
@@ -0,0 +1,36 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
# NOTE(review): verify_runnable comes from libtest.shlib; it presumably
# restricts where this script may run ("global") — confirm there.
verify_runnable "global"
# Prepare the test group using the disks listed in $DISKS.  NOTE(review):
# presumably creates $TESTPOOL on them — confirm in libtest.shlib.
default_setup "$DISKS"
@@ -0,0 +1,169 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/reservation/reservation.shlib
. $STF_SUITE/tests/functional/zvol/zvol_common.shlib
#
# DESCRIPTION:
# Stress test multithreaded transfers to multiple zvols. Also verify
# zvol errors show up in zpool status.
#
# STRATEGY:
#
# For both the normal submit_bio() codepath and the blk-mq codepath, do
# the following:
#
# 1. Create one zvol per CPU
# 2. In parallel, spawn an fio "write and verify" for each zvol
# 3. Inject write errors
# 4. Write to one of the zvols with dd and verify the errors
#
verify_runnable "global"
# One zvol per CPU
num_zvols=$(get_num_cpus)
# If we were making one big zvol from all the pool space, it would
# be this big:
biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL)
# Crude calculation: take the biggest zvol size we could possibly
# create, knock 10% off it (for overhead) and divide by the number
# of ZVOLs we want to make.
#
# Round the result down to a whole number with floor().
# NOTE(review): "typeset -f" normally refers to functions in ksh; an
# integer/float attribute (-i / -F) may have been intended — confirm.
typeset -f each_zvol_size=$(( floor($biggest_zvol_size_possible * 0.9 / \
$num_zvols )))
# Scratch directory handed to fio (--aux-path) for its state files;
# removed again by cleanup.
typeset tmpdir="$(mktemp -d zvol_stress_fio_state.XXXXXX)"
# Create $num_zvols zvols named $TESTPOOL/testvol1..N, each
# $each_zvol_size in size, waiting for every block device to appear
# before moving on to the next one.
function create_zvols
{
	typeset vol

	log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each"
	for i in $(seq $num_zvols) ; do
		vol=$TESTPOOL/testvol$i
		log_must zfs create -V $each_zvol_size $vol
		block_device_wait "$ZVOL_DEVDIR/$vol"
	done
}
# Destroy the zvols made by create_zvols ($TESTPOOL/testvol1..N).
function destroy_zvols
{
	typeset vol

	for i in $(seq $num_zvols) ; do
		vol=$TESTPOOL/testvol$i
		log_must_busy zfs destroy $vol
	done
}
# Run one fio "write and verify" job per zvol, all in parallel, and call
# log_fail if any of them exits non-zero.
#
# Reads the script-level num_zvols, each_zvol_size, ZVOL_DEVDIR, TESTPOOL
# and tmpdir variables.
function do_zvol_stress
{
	# Write 10% of each zvol, or 50MB, whichever is less
	zvol_write_size=$((each_zvol_size / 10))
	if [ $zvol_write_size -gt $((50 * 1048576)) ] ; then
		zvol_write_size=$((50 * 1048576))
	fi
	zvol_write_size_mb=$(($zvol_write_size / 1048576))

	if is_linux ; then
		engine=libaio
	else
		engine=psync
	fi

	# Spawn off one fio per zvol in parallel, each as its own process,
	# remembering the PIDs in zvol order.
	pids=""
	for i in $(seq $num_zvols) ; do
		fio --ioengine=$engine --name=zvol_stress$i --direct=0 \
		    --filename="$ZVOL_DEVDIR/$TESTPOOL/testvol$i" --bs=1048576 \
		    --iodepth=10 --readwrite=randwrite --size=${zvol_write_size} \
		    --verify_async=2 --numjobs=1 --verify=sha1 \
		    --verify_fatal=1 \
		    --continue_on_error=none \
		    --error_dump=1 \
		    --exitall_on_error \
		    --aux-path="$tmpdir" --do_verify=1 &
		pids="$pids $!"
	done

	# Wait for all the spawned fios to finish and look for errors.
	# zvols are numbered from 1 and $pids is in the same order, so the
	# counter has to start at 1 for the message to name the right zvol.
	i=1
	for pid in $pids ; do
		log_note "waiting on $pid ($TESTPOOL/testvol$i)"
		if ! wait $pid ; then
			log_fail "fio error on $TESTPOOL/testvol$i"
		fi
		i=$(($i + 1))
	done
}
# Exit handler: clear injected faults and pool errors, destroy the test
# zvols, turn blk-mq back off and remove fio's scratch directory.
function cleanup
{
	log_must zinject -c all
	log_must zpool clear $TESTPOOL
	destroy_zvols
	set_blk_mq 0

	# Nothing more to do if the scratch directory was never created
	[ -z "$tmpdir" ] && return

	# Remove all fio's leftover state files, then the directory itself
	log_must rm -fd "$tmpdir"/*.state "$tmpdir"
}
log_onexit cleanup
log_assert "Stress test zvols"
# First pass: blk-mq disabled
set_blk_mq 0
create_zvols
# Do some fio write/verifies in parallel
do_zvol_stress
destroy_zvols
# Enable blk-mq (block multi-queue), and re-run the same test
set_blk_mq 1
create_zvols
do_zvol_stress
# The zvols from this second pass are not destroyed here: testvol1 is
# still needed below, and cleanup calls destroy_zvols on exit.
# Inject some errors, and verify we see some IO errors in zpool status
for DISK in $DISKS ; do
log_must zinject -d $DISK -f 10 -e io -T write $TESTPOOL
done
# Write to one zvol while the write faults are injected, then remove
# the injected faults again.
log_must dd if=/dev/zero of=$ZVOL_DEVDIR/$TESTPOOL/testvol1 bs=512 count=50
log_must zinject -c all
# We should see write errors
#
# The awk program sums field 4 of every line that follows a header line
# ending in "CKSUM", up to the next blank line.
# NOTE(review): field 4 is presumably the WRITE column of
# "zpool status -p" — confirm against its output format.
typeset -i write_errors=$(zpool status -p | awk '
!NF { isvdev = 0 }
isvdev { errors += $4 }
/CKSUM$/ { isvdev = 1 }
END { print errors }
')
if [ $write_errors -eq 0 ] ; then
log_fail "Expected to see some write errors"
else
log_note "Correctly saw $write_errors write errors"
fi
log_pass "Done with zvol_stress"