diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index 15dbe1c7d..2f0b386e6 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -332,7 +332,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [ ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [ AC_MSG_RESULT(yes) ],[ - ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors]) + AC_MSG_RESULT(no) ]) ]) @@ -355,7 +355,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [ ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blk_queue_max_segments]) + AC_MSG_RESULT(no) ]) ]) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 1d5d54b80..c01caa6da 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1076,8 +1076,106 @@ static const struct block_device_operations zvol_ops = { #endif }; +typedef struct zvol_queue_limits { + unsigned int zql_max_hw_sectors; + unsigned short zql_max_segments; + unsigned int zql_max_segment_size; + unsigned int zql_io_opt; +} zvol_queue_limits_t; + +static void +zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, + boolean_t use_blk_mq) +{ + limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; + + if (use_blk_mq) { + /* + * IO requests can be really big (1MB). When an IO request + * comes in, it is passed off to zvol_read() or zvol_write() + * in a new thread, where it is chunked up into 'volblocksize' + * sized pieces and processed. So for example, if the request + * is a 1MB write and your volblocksize is 128k, one zvol_write + * thread will take that request and sequentially do ten 128k + * IOs. This is due to the fact that the thread needs to lock + * each volblocksize sized block. So you might be wondering: + * "instead of passing the whole 1MB request to one thread, + * why not pass ten individual 128k chunks to ten threads and + * process the whole write in parallel?" The short answer is + * that there's a sweet spot number of chunks that balances + * the greater parallelism with the added overhead of more + * threads. The sweet spot can be different depending on if you + * have a read or write heavy workload. Writes typically want + * high chunk counts while reads typically want lower ones. On + * a test pool with 6 NVMe drives in a 3x 2-disk mirror + * configuration, with volblocksize=8k, the sweet spot for good + * sequential reads and writes was at 8 chunks. + */ + + /* + * Below we tell the kernel how big we want our requests + * to be. You would think that blk_queue_io_opt() would be + * used to do this since it is used to "set optimal request + * size for the queue", but that doesn't seem to do + * anything - the kernel still gives you huge requests + * with tons of little PAGE_SIZE segments contained within it. + * + * Knowing that the kernel will just give you PAGE_SIZE segments + * no matter what, you can say "ok, I want PAGE_SIZE byte + * segments, and I want 'N' of them per request", where N is + * the correct number of segments for the volblocksize and + * number of chunks you want. + */ +#ifdef HAVE_BLK_MQ + if (zvol_blk_mq_blocks_per_thread != 0) { + unsigned int chunks; + chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); + + limits->zql_max_segment_size = PAGE_SIZE; + limits->zql_max_segments = + (zv->zv_volblocksize * chunks) / PAGE_SIZE; + } else { + /* + * Special case: zvol_blk_mq_blocks_per_thread = 0 + * Max everything out. + */ + limits->zql_max_segments = UINT16_MAX; + limits->zql_max_segment_size = UINT_MAX; + } + } else { +#endif + limits->zql_max_segments = UINT16_MAX; + limits->zql_max_segment_size = UINT_MAX; + } + + limits->zql_io_opt = zv->zv_volblocksize; +} + +#ifdef HAVE_BLK_ALLOC_DISK_2ARG +static void +zvol_queue_limits_convert(zvol_queue_limits_t *limits, + struct queue_limits *qlimits) +{ + memset(qlimits, 0, sizeof (struct queue_limits)); + qlimits->max_hw_sectors = limits->zql_max_hw_sectors; + qlimits->max_segments = limits->zql_max_segments; + qlimits->max_segment_size = limits->zql_max_segment_size; + qlimits->io_opt = limits->zql_io_opt; +} +#else +static void +zvol_queue_limits_apply(zvol_queue_limits_t *limits, + struct request_queue *queue) +{ + blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); + blk_queue_max_segments(queue, limits->zql_max_segments); + blk_queue_max_segment_size(queue, limits->zql_max_segment_size); + blk_queue_io_opt(queue, limits->zql_io_opt); +} +#endif + static int -zvol_alloc_non_blk_mq(struct zvol_state_os *zso) +zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) { #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) #if defined(HAVE_BLK_ALLOC_DISK) @@ -1087,8 +1185,11 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) - struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + struct queue_limits qlimits; + zvol_queue_limits_convert(limits, &qlimits); + struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); if (IS_ERR(disk)) { zso->zvo_disk = NULL; return (1); @@ -1109,6 +1210,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) } zso->zvo_disk->queue = zso->zvo_queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); @@ -1122,13 +1224,14 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) } zso->zvo_disk->queue = zso->zvo_queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ return (0); } static int -zvol_alloc_blk_mq(zvol_state_t *zv) +zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) { #ifdef HAVE_BLK_MQ struct zvol_state_os *zso = zv->zv_zso; @@ -1144,9 +1247,12 @@ zvol_alloc_blk_mq(zvol_state_t *zv) return (1); } zso->zvo_queue = zso->zvo_disk->queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); zso->zvo_disk->minors = ZVOL_MINORS; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) - struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv); + struct queue_limits qlimits; + zvol_queue_limits_convert(limits, &qlimits); + struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); if (IS_ERR(disk)) { zso->zvo_disk = NULL; blk_mq_free_tag_set(&zso->tag_set); @@ -1172,6 +1278,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv) /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif #endif @@ -1211,6 +1318,9 @@ zvol_alloc(dev_t dev, const char *name) zv->zv_zso->use_blk_mq = zvol_use_blk_mq; #endif + zvol_queue_limits_t limits; + zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); + /* * The block layer has 3 interfaces for getting BIOs: * @@ -1227,10 +1337,10 @@ zvol_alloc(dev_t dev, const char *name) * disk and the queue separately. (5.13 kernel or older) */ if (zv->zv_zso->use_blk_mq) { - ret = zvol_alloc_blk_mq(zv); + ret = zvol_alloc_blk_mq(zv, &limits); zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { - ret = zvol_alloc_non_blk_mq(zso); + ret = zvol_alloc_non_blk_mq(zso, &limits); zso->zvo_disk->fops = &zvol_ops; } if (ret != 0) @@ -1514,74 +1624,10 @@ zvol_os_create_minor(const char *name) set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); - blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, - (DMU_MAX_ACCESS / 4) >> 9); - if (zv->zv_zso->use_blk_mq) { - /* - * IO requests can be really big (1MB). When an IO request - * comes in, it is passed off to zvol_read() or zvol_write() - * in a new thread, where it is chunked up into 'volblocksize' - * sized pieces and processed. So for example, if the request - * is a 1MB write and your volblocksize is 128k, one zvol_write - * thread will take that request and sequentially do ten 128k - * IOs. This is due to the fact that the thread needs to lock - * each volblocksize sized block. So you might be wondering: - * "instead of passing the whole 1MB request to one thread, - * why not pass ten individual 128k chunks to ten threads and - * process the whole write in parallel?" The short answer is - * that there's a sweet spot number of chunks that balances - * the greater parallelism with the added overhead of more - * threads. The sweet spot can be different depending on if you - * have a read or write heavy workload. Writes typically want - * high chunk counts while reads typically want lower ones. On - * a test pool with 6 NVMe drives in a 3x 2-disk mirror - * configuration, with volblocksize=8k, the sweet spot for good - * sequential reads and writes was at 8 chunks. - */ - - /* - * Below we tell the kernel how big we want our requests - * to be. You would think that blk_queue_io_opt() would be - * used to do this since it is used to "set optimal request - * size for the queue", but that doesn't seem to do - * anything - the kernel still gives you huge requests - * with tons of little PAGE_SIZE segments contained within it. - * - * Knowing that the kernel will just give you PAGE_SIZE segments - * no matter what, you can say "ok, I want PAGE_SIZE byte - * segments, and I want 'N' of them per request", where N is - * the correct number of segments for the volblocksize and - * number of chunks you want. - */ -#ifdef HAVE_BLK_MQ - if (zvol_blk_mq_blocks_per_thread != 0) { - unsigned int chunks; - chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); - - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, - PAGE_SIZE); - blk_queue_max_segments(zv->zv_zso->zvo_queue, - (zv->zv_volblocksize * chunks) / PAGE_SIZE); - } else { - /* - * Special case: zvol_blk_mq_blocks_per_thread = 0 - * Max everything out. - */ - blk_queue_max_segments(zv->zv_zso->zvo_queue, - UINT16_MAX); - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, - UINT_MAX); - } -#endif - } else { - blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); - blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); - } blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); - blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_zso->zvo_queue,