zvol: Fix blk-mq sync

The zvol blk-mq codepaths would erroneously send FLUSH and TRIM
commands down the read codepath rather than the write codepath.
This fixes the issue and updates the zvol_misc_fua test to verify
that sync writes are actually happening.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <alexander.motin@TrueNAS.com>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #17761
Closes #17765
commit 0bb5950e72
parent 1baecd3a78
Author: Tony Hutter <hutter2@llnl.gov>
Date: 2025-09-29 16:29:20 -07:00

3 changed files with 61 additions and 20 deletions

include/os/linux/kernel/linux/blkdev_compat.h

@@ -542,24 +542,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
 }
 #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 
-/*
- * All the io_*() helper functions below can operate on a bio, or a rq, but
- * not both. The older submit_bio() codepath will pass a bio, and the
- * newer blk-mq codepath will pass a rq.
- */
-static inline int
-io_data_dir(struct bio *bio, struct request *rq)
-{
-	if (rq != NULL) {
-		if (op_is_write(req_op(rq))) {
-			return (WRITE);
-		} else {
-			return (READ);
-		}
-	}
-	return (bio_data_dir(bio));
-}
-
 static inline int
 io_is_flush(struct bio *bio, struct request *rq)
 {

module/os/linux/zfs/zvol_os.c

@@ -523,7 +523,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
-	int rw = io_data_dir(bio, rq);
+	int rw;
+
+	if (rq != NULL) {
+		/*
+		 * Flush & trim requests go down the zvol_write codepath. Or
+		 * more specifically:
+		 *
+		 * If request is a write, or if it's op_is_sync() and not a
+		 * read, or if it's a flush, or if it's a discard, then send the
+		 * request down the write path.
+		 */
+		if (op_is_write(rq->cmd_flags) ||
+		    (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+		    req_op(rq) == REQ_OP_FLUSH ||
+		    op_is_discard(rq->cmd_flags)) {
+			rw = WRITE;
+		} else {
+			rw = READ;
+		}
+	} else {
+		rw = bio_data_dir(bio);
+	}
 
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
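For readers who want to experiment with the routing rule outside the kernel, the sketch below models the same decision in plain userspace C. The mock_* names and values are invented for illustration and are not the real <linux/blk_types.h> flags; only the branch structure mirrors the new rq path in zvol_request_impl() above.

/*
 * Illustration only: a userspace model of the READ/WRITE routing above.
 * The enum values and the "sync" field are invented stand-ins, not the
 * real <linux/blk_types.h> flags; only the decision structure mirrors
 * the new rq branch in zvol_request_impl().
 */
#include <stdbool.h>
#include <stdio.h>

enum mock_op { MOCK_READ, MOCK_WRITE, MOCK_FLUSH, MOCK_DISCARD };

struct mock_rq {
	enum mock_op op;	/* models req_op(rq) */
	bool sync;		/* models op_is_sync(rq->cmd_flags) */
};

/* Anything that modifies or syncs the volume takes the write path. */
static int classify_rw(const struct mock_rq *rq)
{
	if (rq->op == MOCK_WRITE ||
	    (rq->sync && rq->op != MOCK_READ) ||
	    rq->op == MOCK_FLUSH ||
	    rq->op == MOCK_DISCARD)
		return (1);	/* WRITE */
	return (0);		/* READ */
}

int main(void)
{
	struct mock_rq flush = { .op = MOCK_FLUSH, .sync = true };
	struct mock_rq trim = { .op = MOCK_DISCARD, .sync = false };
	struct mock_rq read = { .op = MOCK_READ, .sync = false };

	/* Flush and trim land on the write path; plain reads do not. */
	printf("flush -> %s\n", classify_rw(&flush) ? "WRITE" : "READ");
	printf("trim  -> %s\n", classify_rw(&trim) ? "WRITE" : "READ");
	printf("read  -> %s\n", classify_rw(&read) ? "WRITE" : "READ");
	return (0);
}

Compiled with any C compiler, this prints WRITE for the flush and trim cases and READ for the plain read, which is the behavior the fix establishes for blk-mq requests.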

tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh

@@ -50,17 +50,53 @@ fi
 
 typeset datafile1="$(mktemp -t zvol_misc_fua1.XXXXXX)"
 typeset datafile2="$(mktemp -t zvol_misc_fua2.XXXXXX)"
+typeset datafile3="$(mktemp -t zvol_misc_fua3_log.XXXXXX)"
 typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
+typeset DISK1=${DISKS%% *}
 
 function cleanup
 {
-	rm "$datafile1" "$datafile2"
+	log_must zpool remove $TESTPOOL $datafile3
+	rm "$datafile1" "$datafile2" "$datafile3"
 }
 
+# Prints the total number of sync writes for a vdev
+# $1: vdev
+function get_sync
+{
+	zpool iostat -p -H -v -r $TESTPOOL $1 | \
+	    awk '/[0-9]+$/{s+=$4+$5} END{print s}'
+}
+
 function do_test {
 	# Wait for udev to create symlinks to our zvol
 	block_device_wait $zvolpath
 
+	# Write using sync (creates FLUSH calls after writes, but not FUA)
+	old_vdev_writes=$(get_sync $DISK1)
+	old_log_writes=$(get_sync $datafile3)
+
+	log_must fio --name=write_iops --size=5M \
+	    --ioengine=libaio --verify=0 --bs=4K \
+	    --iodepth=1 --rw=randwrite --group_reporting=1 \
+	    --filename=$zvolpath --sync=1
+
+	vdev_writes=$(( $(get_sync $DISK1) - $old_vdev_writes))
+	log_writes=$(( $(get_sync $datafile3) - $old_log_writes))
+
+	# When we're doing sync writes, we should see many more writes go to
+	# the log vs the first vdev. Experiments show anywhere from a 160-320x
+	# ratio of writes to the log vs the first vdev (due to some straggler
+	# writes to the first vdev).
+	#
+	# Check that we have a large ratio (100x) of sync writes going to the
+	# log device
+	ratio=$(($log_writes / $vdev_writes))
+	log_note "Got $log_writes log writes, $vdev_writes vdev writes."
+	if [ $ratio -lt 100 ] ; then
+		log_fail "Expected > 100x more log writes than vdev writes."
+	fi
+
 	# Create a data file
 	log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5
@@ -81,6 +117,8 @@ log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)"
 log_onexit cleanup
 
 log_must zfs set compression=off $TESTPOOL/$TESTVOL
+log_must truncate -s 100M $datafile3
+log_must zpool add $TESTPOOL log $datafile3
 
 log_note "Testing without blk-mq"