From f2330bd1568489ae1fb16d975a5a9bcfe12ed219 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 28 Apr 2022 18:12:24 -0400 Subject: [PATCH] Default zfs_max_recordsize to 16M Increase the default allowed maximum recordsize from 1M to 16M. As described in the zfs(4) man page, there are significant costs which need to be considered before using very large blocks. However, there are scenarios where they make good sense and it should no longer be necessary to artificially restrict their use behind a module option. Note that for 32-bit platforms we continue to leave this restriction in place due to the limited virtual address space available (256-512MB). On these systems only a handful of blocks could be cached at any one time severely impacting performance and potentially stability. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rich Ercolani Closes #12830 Closes #13302 --- man/man4/zfs.4 | 6 ++--- module/zfs/dsl_dataset.c | 25 ++++++++++--------- module/zfs/zio.c | 9 ------- .../alloc_class/alloc_class_011_neg.ksh | 2 +- .../zfs_create/zfs_create_008_neg.ksh | 2 +- .../cli_root/zfs_set/zfs_set_001_neg.ksh | 2 +- .../zpool_create/zpool_create_023_neg.ksh | 2 +- 7 files changed, 20 insertions(+), 28 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a18917eb1..290ecd22e 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1469,15 +1469,15 @@ feature uses to estimate incoming log blocks. .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq ulong Maximum number of rows allowed in the summary of the spacemap log. . -.It Sy zfs_max_recordsize Ns = Ns Sy 1048576 Po 1MB Pc Pq int +.It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16MB Pc Pq int We currently support block sizes from .Em 512B No to Em 16MB . The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. 
Additionally, very large blocks can have an impact on I/O latency, and also potentially on the memory allocator. -Therefore, we do not allow the recordsize to be set larger than this tunable. -Larger blocks can be created by changing it, +Therefore, we formerly forbade creating blocks larger than 1M. +Larger blocks could be created by changing this tunable, and pools with larger blocks can always be imported and used, regardless of this setting. . diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 2d98c2f04..ca894c352 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -73,12 +73,19 @@ * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. + * we did not allow the recordsize to be set larger than zfs_max_recordsize + * (former default: 1MB). Larger blocks could be created by changing this + * tunable, and pools with larger blocks could always be imported and used, + * regardless of this setting. + * + * We do, however, still limit it by default to 1M on 32-bit platforms, + * because Linux's 3/1 memory split doesn't leave much room for 16M chunks. 
*/ -int zfs_max_recordsize = 1 * 1024 * 1024; +#ifdef _ILP32 +int zfs_max_recordsize = 1 * 1024 * 1024; +#else +int zfs_max_recordsize = 16 * 1024 * 1024; +#endif static int zfs_allow_redacted_dataset_mount = 0; #define SWITCH64(x, y) \ @@ -4964,13 +4971,7 @@ dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg, return (0); } -#if defined(_LP64) -#define RECORDSIZE_PERM ZMOD_RW -#else -/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ -#define RECORDSIZE_PERM ZMOD_RD -#endif -ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, ZMOD_RW, "Max allowed record size"); ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index f6adea572..2a16d5cef 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -166,15 +166,6 @@ zio_init(void) cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; -#if defined(_ILP32) && defined(_KERNEL) - /* - * Cache size limited to 1M on 32-bit platforms until ARC - * buffers no longer require virtual address space. 
- */ - if (size > zfs_max_recordsize) - break; -#endif - while (!ISP2(p2)) p2 &= p2 - 1; diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh index d804e5371..0be49b858 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh @@ -35,7 +35,7 @@ log_must disk_setup log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ $CLASS_DISK0 $CLASS_DISK1 -for value in 256 1025 2097152 +for value in 256 1025 33554432 do log_mustnot zfs set special_small_blocks=$value $TESTPOOL done diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh index a905e50df..d82f10f71 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh @@ -56,7 +56,7 @@ set -A args "ab" "-?" 
"-cV" "-Vc" "-c -V" "c" "V" "--c" "-e" "-s" \ "-blah" "-cV 12k" "-s -cV 1P" "-sc" "-Vs 5g" "-o" "--o" "-O" "--O" \ "-o QuOta=none" "-o quota=non" "-o quota=abcd" "-o quota=0" "-o quota=" \ "-o ResErVaTi0n=none" "-o reserV=none" "-o reservation=abcd" "-o reserv=" \ - "-o recorDSize=64k" "-o recordsize=2048K" "-o recordsize=2M" \ + "-o recorDSize=64k" "-o recordsize=32768K" "-o recordsize=32M" \ "-o recordsize=256" "-o recsize=" "-o recsize=zero" "-o recordsize=0" \ "-o mountPoint=/tmp/tmpfile$$" "-o mountpoint=non0" "-o mountpoint=" \ "-o mountpoint=LEGACY" "-o mounpoint=none" \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh index 5cfaec55e..e58fe9bfe 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh @@ -78,7 +78,7 @@ while (( i < ${#dataset[@]} )); do (( j += 1 )) done # Additional recordsize - set_n_check_prop "2048K" "recordsize" "${dataset[i]}" false + set_n_check_prop "32768K" "recordsize" "${dataset[i]}" false set_n_check_prop "128B" "recordsize" "${dataset[i]}" false (( i += 1 )) done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh index f101521bd..780cf86d6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh @@ -52,7 +52,7 @@ log_onexit cleanup set -A args "QuOta=none" "quota=non" "quota=abcd" "quota=0" "quota=" \ "ResErVaTi0n=none" "reserV=none" "reservation=abcd" "reserv=" \ - "recorDSize=64k" "recordsize=2M" "recordsize=2048K" \ + "recorDSize=64k" "recordsize=32M" "recordsize=32768K" \ "recordsize=256" "recsize=" "recsize=zero" "recordsize=0" \ "mountPoint=/tmp/tmpfile$$" "mountpoint=non0" 
"mountpoint=" \ "mountpoint=LEGACY" "mounpoint=none" \