From 320f0c6022e1c9bdc9063f849c6b2e4fa3b93995 Mon Sep 17 00:00:00 2001 From: Finix1979 Date: Fri, 9 Sep 2022 01:29:41 +0800 Subject: [PATCH] Add Linux posix_fadvise support The purpose of this PR is to accept fadvise ioctl from userland to do read-ahead on demand. It could dramatically improve sequential read performance, especially when primarycache is set to metadata or zfs_prefetch_disable is 1. If the file is mmapped, generic_fadvise is also called for page cache read-ahead besides dmu_prefetch. Only POSIX_FADV_WILLNEED and POSIX_FADV_SEQUENTIAL are supported in this PR currently. Reviewed-by: Brian Behlendorf Signed-off-by: Finix Yan Closes #13694 --- config/kernel-fadvise.m4 | 23 +++++ config/kernel-generic_fadvise.m4 | 27 ++++++ config/kernel.m4 | 4 + module/os/linux/zfs/zpl_file.c | 62 ++++++++++++ tests/runfiles/linux.run | 4 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 3 + tests/zfs-tests/cmd/file/file_fadvise.c | 97 +++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 3 + .../functional/checksum/filetest_002_pos.ksh | 2 +- .../tests/functional/fadvise/cleanup.ksh | 28 ++++++ .../functional/fadvise/fadvise_sequential.ksh | 80 +++++++++++++++ .../tests/functional/fadvise/setup.ksh | 30 ++++++ .../functional/fault/auto_spare_002_pos.ksh | 2 +- 15 files changed, 365 insertions(+), 2 deletions(-) create mode 100644 config/kernel-fadvise.m4 create mode 100644 config/kernel-generic_fadvise.m4 create mode 100644 tests/zfs-tests/cmd/file/file_fadvise.c create mode 100755 tests/zfs-tests/tests/functional/fadvise/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh create mode 100755 tests/zfs-tests/tests/functional/fadvise/setup.ksh diff --git a/config/kernel-fadvise.m4 b/config/kernel-fadvise.m4 new file mode 100644 index 000000000..08912de16 --- /dev/null +++ b/config/kernel-fadvise.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # Linux 4.19 API +dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_FADVISE], [ + ZFS_LINUX_TEST_SRC([file_fadvise], [ + #include + + static const struct file_operations + fops __attribute__ ((unused)) = { + .fadvise = NULL, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FADVISE], [ + AC_MSG_CHECKING([whether fops->fadvise() exists]) + ZFS_LINUX_TEST_RESULT([file_fadvise], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILE_FADVISE, 1, [fops->fadvise() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-generic_fadvise.m4 b/config/kernel-generic_fadvise.m4 new file mode 100644 index 000000000..8d122064b --- /dev/null +++ b/config/kernel-generic_fadvise.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # 5.3 API change +dnl # The generic_fadvise() function is present since 4.19 kernel +dnl # but it was not exported until Linux 5.3. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FADVISE], [ + ZFS_LINUX_TEST_SRC([generic_fadvise], [ + #include + ], [ + struct file *fp __attribute__ ((unused)) = NULL; + loff_t offset __attribute__ ((unused)) = 0; + loff_t len __attribute__ ((unused)) = 0; + int advise __attribute__ ((unused)) = 0; + generic_fadvise(fp, offset, len, advise); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FADVISE], [ + AC_MSG_CHECKING([whether generic_fadvise() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_fadvise], + [generic_fadvise], [mm/fadvise.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_FADVISE, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1f274cbe4..6aad2cf88 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -42,6 +42,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_FALLOCATE + ZFS_AC_KERNEL_SRC_FADVISE + ZFS_AC_KERNEL_SRC_GENERIC_FADVISE ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_SRC_RWSEM ZFS_AC_KERNEL_SRC_SCHED @@ -161,6 +163,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA 
ZFS_AC_KERNEL_FALLOCATE + ZFS_AC_KERNEL_FADVISE + ZFS_AC_KERNEL_GENERIC_FADVISE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_RWSEM ZFS_AC_KERNEL_SCHED diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 43b7fb60a..b0d9f37a3 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -27,6 +27,7 @@ #ifdef CONFIG_COMPAT #include #endif +#include #include #include #include @@ -37,6 +38,9 @@ defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) #include #endif +#ifdef HAVE_FILE_FADVISE +#include +#endif #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include #endif @@ -906,6 +910,61 @@ zpl_ioctl_getversion(struct file *filp, void __user *arg) return (copy_to_user(arg, &generation, sizeof (generation))); } +#ifdef HAVE_FILE_FADVISE +static int +zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) +{ + struct inode *ip = file_inode(filp); + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os = zfsvfs->z_os; + int error = 0; + + if (S_ISFIFO(ip->i_mode)) + return (-ESPIPE); + + if (offset < 0 || len < 0) + return (-EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + switch (advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: +#ifdef HAVE_GENERIC_FADVISE + if (zn_has_cached_data(zp)) + error = generic_fadvise(filp, offset, len, advice); +#endif + /* + * Pass on the caller's size directly, but note that + * dmu_prefetch_max will effectively cap it. If there + * really is a larger sequential access pattern, perhaps + * dmu_zfetch will detect it. 
+ */ + if (len == 0) + len = i_size_read(ip) - offset; + + dmu_prefetch(os, zp->z_id, 0, offset, len, + ZIO_PRIORITY_ASYNC_READ); + break; + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_DONTNEED: + case POSIX_FADV_NOREUSE: + /* ignored for now */ + break; + default: + error = -EINVAL; + break; + } + + ZFS_EXIT(zfsvfs); + + return (error); +} +#endif /* HAVE_FILE_FADVISE */ + #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) @@ -1259,6 +1318,9 @@ const struct file_operations zpl_file_operations = { .aio_fsync = zpl_aio_fsync, #endif .fallocate = zpl_fallocate, +#ifdef HAVE_FILE_FADVISE + .fadvise = zpl_fadvise, +#endif .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 9b32e73af..09dfb5eb1 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -89,6 +89,10 @@ tags = ['functional', 'devices'] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill'] tags = ['functional', 'events'] +[tests/functional/fadvise:Linux] +tests = ['fadvise_sequential'] +tags = ['functional', 'fadvise'] + [tests/functional/fallocate:Linux] tests = ['fallocate_prealloc', 'fallocate_zero-range'] tags = ['functional', 'fallocate'] diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 20d138253..1fd54c1dd 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -4,6 +4,7 @@ /devname2devid /dir_rd_update /draid +/file_fadvise /file_append /file_check /file_trunc diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 3c8faf5af..c19c870cf 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -128,4 +128,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/read_dos_attributes %D%/write_dos_attribu scripts_zfs_tests_bin_PROGRAMS += 
%D%/randfree_file %C%_randfree_file_SOURCES = %D%/file/randfree_file.c + +scripts_zfs_tests_bin_PROGRAMS += %D%/file_fadvise +%C%_file_fadvise_SOURCES = %D%/file/file_fadvise.c endif diff --git a/tests/zfs-tests/cmd/file/file_fadvise.c b/tests/zfs-tests/cmd/file/file_fadvise.c new file mode 100644 index 000000000..e1afb6d0a --- /dev/null +++ b/tests/zfs-tests/cmd/file/file_fadvise.c @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2022 by Information2 Software, Inc. All rights reserved. 
+ */ + +#include "file_common.h" +#include +#include +#include +#include + +/* + * Call fadvise to prefetch data + */ +static const char *execname = "file_fadvise"; + +static void +usage(void) +{ + (void) fprintf(stderr, + "usage: %s -f filename -a advise \n", execname); +} + +int +main(int argc, char *argv[]) +{ + char *filename = NULL; + int advise = 0; + int fd, ch; + int err = 0; + + while ((ch = getopt(argc, argv, "a:f:")) != EOF) { + switch (ch) { + case 'a': + advise = atoll(optarg); + break; + case 'f': + filename = optarg; + break; + case '?': + (void) printf("unknown arg %c\n", optopt); + usage(); + break; + } + } + + if (!filename) { + (void) printf("Filename not specified (-f )\n"); + err++; + } + + if (advise < POSIX_FADV_NORMAL || advise > POSIX_FADV_NOREUSE) { + (void) printf("advise is invalid\n"); + err++; + } + + if (err) { + usage(); /* no return */ + return (1); + } + + if ((fd = open(filename, O_RDWR, 0666)) < 0) { + perror("open"); + return (1); + } + + posix_fadvise(fd, 0, 0, advise); + + close(fd); + + return (0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 409856221..c05b91832 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -184,6 +184,7 @@ export ZFSTEST_FILES='badsend devname2devid dir_rd_update draid + file_fadvise file_append file_check file_trunc diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 89b2ca866..d53316643 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1370,6 +1370,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ functional/exec/setup.ksh \ + functional/fadvise/cleanup.ksh \ + functional/fadvise/fadvise_sequential.ksh \ + functional/fadvise/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ 
functional/fallocate/fallocate_punch-hole.ksh \ diff --git a/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh b/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh index a0be1c205..23e7aa577 100755 --- a/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh @@ -76,7 +76,7 @@ while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL - log_mustnot eval "cat $TESTDIR/test_$type >/dev/null" + log_mustnot eval "dd if=$TESTDIR/test_$type of=/dev/null bs=$WRITESZ count=$NWRITES" cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \ awk '{print $5}') diff --git a/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh b/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh new file mode 100755 index 000000000..8b5b43a74 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fadvise/cleanup.ksh @@ -0,0 +1,28 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Portions Copyright (c) 2022 Information2 Software, Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh b/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh new file mode 100755 index 000000000..7b7d1d379 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Portions Copyright (c) 2022 Information2 Software, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib + +# +# DESCRIPTION: +# Test posix_fadvise. +# +# STRATEGY: +# 1. Set primarycache to metadata in order to disable prefetch +# 2. Write some data to file +# 3. get data_size field from arcstat +# 4. call file_fadvise with POSIX_FADV_SEQUENTIAL +# 5. get data_size field from arcstat again +# 6. 
latter data_size should be bigger than former one +# + +# NOTE: if HAVE_FILE_FADVISE is not defined, the former data_size +# should be less than or equal to the latter one + +verify_runnable "global" + +FILE=$TESTDIR/$TESTFILE0 +BLKSZ=$(get_prop recordsize $TESTPOOL) + +function cleanup +{ + log_must zfs set primarycache=all $TESTPOOL + [[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/* +} + +getstat() { + awk -v c="$1" '$1 == c {print $3; exit}' /proc/spl/kstat/zfs/arcstats +} + +log_assert "Ensure fadvise prefetch data" + +log_onexit cleanup + +log_must zfs set primarycache=metadata $TESTPOOL + +log_must file_write -o create -f $FILE -b $BLKSZ -c 1000 +sync_pool $TESTPOOL + +data_size1=$(getstat data_size) + +log_must file_fadvise -f $FILE -a 2 +sleep 10 + +data_size2=$(getstat data_size) +log_note "original data_size is $data_size1, final data_size is $data_size2" + +log_must [ $data_size1 -le $data_size2 ] + +log_pass "Ensure data could be prefetched" diff --git a/tests/zfs-tests/tests/functional/fadvise/setup.ksh b/tests/zfs-tests/tests/functional/fadvise/setup.ksh new file mode 100755 index 000000000..8ddd73307 --- /dev/null +++ b/tests/zfs-tests/tests/functional/fadvise/setup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Portions Copyright (c) 2022 Information2 Software, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} +default_setup_noexit $DISK +log_pass diff --git a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh index e9517bad7..bd32be9a4 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -73,7 +73,7 @@ for type in "mirror" "raidz" "raidz2"; do # 4. Inject CHECKSUM ERRORS on read with a zinject error handler log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL - log_must cp $TESTFILE /dev/null + log_must dd if=$TESTFILE of=/dev/null bs=1M count=64 # 5. Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare"