From f375b23c026aec00cc9527470084191b5071d9b2 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Tue, 24 May 2022 12:43:22 -0400 Subject: [PATCH] Tiered early abort, zstd edition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that "do LZ4 and zstd-1 both fail" is a great heuristic for "don't even bother trying higher zstd tiers". By way of illustration: $ cat /incompress | mbuffer | zfs recv -o compression=zstd-12 evenfaster/lowcomp_1M_zstd12_normal summary: 39.8 GiByte in 3min 40.2sec - average of 185 MiB/s $ echo 3 | sudo tee /sys/module/zzstd/parameters/zstd_lz4_pass 3 $ cat /incompress | mbuffer -m 4G | zfs recv -o compression=zstd-12 evenfaster/lowcomp_1M_zstd12_patched summary: 39.8 GiByte in 48.6sec - average of 839 MiB/s $ sudo zfs list -p -o name,used,lused,ratio evenfaster/lowcomp_1M_zstd12_normal evenfaster/lowcomp_1M_zstd12_patched NAME USED LUSED RATIO evenfaster/lowcomp_1M_zstd12_normal 39549931520 42721221632 1.08 evenfaster/lowcomp_1M_zstd12_patched 39626399744 42721217536 1.07 $ python3 -c "print(39626399744 - 39549931520)" 76468224 $ I'll take 76 MB out of 42 GB for > 4x speedup. 
Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: Kjeld Schouten Reviewed-by: Ahelenia Ziemiańska Signed-off-by: Rich Ercolani Closes #13244 --- include/sys/zstd/zstd.h | 4 ++ man/man4/zfs.4 | 8 +++ module/zfs/zio_compress.c | 2 +- module/zstd/zfs_zstd.c | 126 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 134 insertions(+), 6 deletions(-) diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h index ca32a7464..ec2341b76 100644 --- a/include/sys/zstd/zstd.h +++ b/include/sys/zstd/zstd.h @@ -78,6 +78,8 @@ typedef struct zfs_zstd_meta { * kstat helper macros */ #define ZSTDSTAT(stat) (zstd_stats.stat.value.ui64) +#define ZSTDSTAT_ZERO(stat) \ + (atomic_store_64(&zstd_stats.stat.value.ui64, 0)) #define ZSTDSTAT_ADD(stat, val) \ atomic_add_64(&zstd_stats.stat.value.ui64, (val)) #define ZSTDSTAT_SUB(stat, val) \ @@ -90,6 +92,8 @@ void zstd_fini(void); size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int level); +size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, + size_t d_len, int level); int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level); int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, size_t d_len, uint8_t *level); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5ef517c46..c95fa98c5 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2129,6 +2129,14 @@ However, if there are fewer than metaslabs in the vdev, this functionality is disabled. This ensures that we don't set aside an unreasonable amount of space for the ZIL. . +.It Sy zstd_earlyabort_pass Ns = Ns Sy 1 Pq int +Whether heuristic for detection of incompressible data with zstd levels >= 3 +using LZ4 and zstd-1 passes is enabled. +. +.It Sy zstd_abort_size Ns = Ns Sy 131072 Pq int +Minimal uncompressed size (inclusive) of a record before the early abort +heuristic will be attempted. +. 
.It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int If non-zero, the zio deadman will produce debugging messages .Pq see Sy zfs_dbgmsg_enable diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index cded11f4c..38020ce22 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -66,7 +66,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, {"zle", 64, zle_compress, zle_decompress, NULL}, {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, - {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap, zfs_zstd_decompress, zfs_zstd_decompress_level}, }; diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index 04e52ae3c..413518989 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -50,6 +50,10 @@ #include "lib/zstd.h" #include "lib/common/zstd_errors.h" +static int zstd_earlyabort_pass = 1; +static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3; +static unsigned int zstd_abort_size = (128 * 1024); + static kstat_t *zstd_ksp = NULL; typedef struct zstd_stats { @@ -62,6 +66,21 @@ typedef struct zstd_stats { kstat_named_t zstd_stat_dec_header_inval; kstat_named_t zstd_stat_com_fail; kstat_named_t zstd_stat_dec_fail; + /* + * LZ4 first-pass early abort verdict + */ + kstat_named_t zstd_stat_lz4pass_allowed; + kstat_named_t zstd_stat_lz4pass_rejected; + /* + * zstd-1 second-pass early abort verdict + */ + kstat_named_t zstd_stat_zstdpass_allowed; + kstat_named_t zstd_stat_zstdpass_rejected; + /* + * We excluded this from early abort for some reason + */ + kstat_named_t zstd_stat_passignored; + kstat_named_t zstd_stat_passignored_size; kstat_named_t zstd_stat_buffers; kstat_named_t zstd_stat_size; } zstd_stats_t; @@ -76,10 +95,44 @@ static zstd_stats_t zstd_stats = { { "decompress_header_invalid", KSTAT_DATA_UINT64 }, { "compress_failed", KSTAT_DATA_UINT64 }, { "decompress_failed", 
KSTAT_DATA_UINT64 }, + { "lz4pass_allowed", KSTAT_DATA_UINT64 }, + { "lz4pass_rejected", KSTAT_DATA_UINT64 }, + { "zstdpass_allowed", KSTAT_DATA_UINT64 }, + { "zstdpass_rejected", KSTAT_DATA_UINT64 }, + { "passignored", KSTAT_DATA_UINT64 }, + { "passignored_size", KSTAT_DATA_UINT64 }, { "buffers", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, }; +#ifdef _KERNEL +static int +kstat_zstd_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE && ksp == zstd_ksp) { + ZSTDSTAT_ZERO(zstd_stat_alloc_fail); + ZSTDSTAT_ZERO(zstd_stat_alloc_fallback); + ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail); + ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail); + ZSTDSTAT_ZERO(zstd_stat_com_inval); + ZSTDSTAT_ZERO(zstd_stat_dec_inval); + ZSTDSTAT_ZERO(zstd_stat_dec_header_inval); + ZSTDSTAT_ZERO(zstd_stat_com_fail); + ZSTDSTAT_ZERO(zstd_stat_dec_fail); + ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed); + ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected); + ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed); + ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected); + ZSTDSTAT_ZERO(zstd_stat_passignored); + ZSTDSTAT_ZERO(zstd_stat_passignored_size); + } + + return (0); +} +#endif + /* Enums describing the allocator type specified by kmem_type in zstd_kmem */ enum zstd_kmem_type { ZSTD_KMEM_UNKNOWN = 0, @@ -377,6 +430,64 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) } +size_t +zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level) +{ + int16_t zstd_level; + if (zstd_enum_to_level(level, &zstd_level)) { + ZSTDSTAT_BUMP(zstd_stat_com_inval); + return (s_len); + } + /* + * A zstd early abort heuristic. + * + * - Zeroth, if this is < zstd-3, or < zstd_abort_size (currently + * 128k), don't try any of this, just go. 
+ * (because experimentally that was a reasonable cutoff for a perf win + * with tiny ratio change) + * - First, we try LZ4 compression, and if it doesn't early abort, we + * jump directly to whatever compression level we intended to try. + * - Second, we try zstd-1 - if that errors out (usually, but not + * exclusively, if it would overflow), we give up early. + * + * If it works, instead we go on and compress anyway. + * + * Why two passes? LZ4 alone gets you a lot of the way, but on highly + * compressible data, it was losing up to 8.5% of the compressed + * savings versus no early abort, and all the zstd-fast levels are + * worse indications on their own than LZ4, and don't improve the LZ4 + * pass noticeably if stacked like this. + */ + size_t actual_abort_size = zstd_abort_size; + if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && + s_len >= actual_abort_size) { + int pass_len = 1; + pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0); + if (pass_len < d_len) { + ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); + goto keep_trying; + } + ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); + + pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len, + ZIO_ZSTD_LEVEL_1); + if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { + ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); + return (s_len); + } + ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); + } else { + ZSTDSTAT_BUMP(zstd_stat_passignored); + if (s_len < actual_abort_size) { + ZSTDSTAT_BUMP(zstd_stat_passignored_size); + } + } +keep_trying: + return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level)); + +} + /* Compress block using zstd */ size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, @@ -437,8 +548,10 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, * too small, that is not a failure. Everything else is a * failure, so increment the compression failure counter. 
*/ - if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) { + int err = ZSTD_getErrorCode(c_len); + if (err != ZSTD_error_dstSize_tooSmall) { ZSTDSTAT_BUMP(zstd_stat_com_fail); + dprintf("Error: %s", ZSTD_getErrorString(err)); } return (s_len); } @@ -753,6 +866,9 @@ zstd_init(void) if (zstd_ksp != NULL) { zstd_ksp->ks_data = &zstd_stats; kstat_install(zstd_ksp); +#ifdef _KERNEL + zstd_ksp->ks_update = kstat_zstd_update; +#endif } return (0); @@ -781,8 +897,8 @@ module_init(zstd_init); module_exit(zstd_fini); #endif -EXPORT_SYMBOL(zfs_zstd_compress); -EXPORT_SYMBOL(zfs_zstd_decompress_level); -EXPORT_SYMBOL(zfs_zstd_decompress); -EXPORT_SYMBOL(zfs_zstd_cache_reap_now); +ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, INT, ZMOD_RW, + "Enable early abort attempts when using zstd"); +ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW, + "Minimal size of block to attempt early abort"); #endif