From 8f1b7a6fa6762ea4c89198ceb11c521f80b92ddc Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 3 May 2024 08:18:35 +1000 Subject: [PATCH] vdev_disk: disable flushes if device does not support it If the underlying device doesn't have a write-back cache, the kernel will just return a successful response. This doesn't hurt anything, but it's extra work on the IO taskqs that is unnecessary. So, detect this when we open the device for the first time. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16148 --- include/os/linux/kernel/linux/blkdev_compat.h | 27 +++++++++++++++++++ module/os/linux/zfs/vdev_disk.c | 7 +++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index b0f398354..658f54621 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -94,6 +94,33 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) #endif } +/* + * Detect if a device has a write cache. Used to set the initial value for the + * vdev nowritecache flag. + * + * 4.10: QUEUE_FLAG_WC added. Initialised by the driver, but can be changed + * later by the operator. If not set, kernel will return flush requests + * immediately without doing anything. + * 6.6: QUEUE_FLAG_HW_WC added. Initialised by the driver, can't be changed. + * Only controls if the operator is allowed to change _WC. Initial version + * buggy; aliased to QUEUE_FLAG_FUA, so unusable. + * 6.6.10, 6.7: QUEUE_FLAG_HW_WC fixed. + * + * Older than 4.10 we just assume write cache, and let the normal flush fail + * detection apply. 
+ */ +static inline boolean_t +zfs_bdev_has_write_cache(struct block_device *bdev) +{ +#if defined(QUEUE_FLAG_HW_WC) && QUEUE_FLAG_HW_WC != QUEUE_FLAG_FUA + return (test_bit(QUEUE_FLAG_HW_WC, &bdev_get_queue(bdev)->queue_flags)); +#elif defined(QUEUE_FLAG_WC) + return (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags)); +#else + return (B_TRUE); +#endif +} + static inline void blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 2cea61a62..463c5f705 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -429,8 +429,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(bdev); - /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ - v->vdev_nowritecache = B_FALSE; + /* + * Clear the nowritecache flag if the device has a write cache, so + * that flush requests are issued again; otherwise set it to skip them. + */ + v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev); /* Set when device reports it supports TRIM. */ v->vdev_has_trim = bdev_discard_supported(bdev);