backport 2.2.4 staging for better 6.8 support
Use the current ZFS 2.2.4 staging tree [0] with commit deb7a8423 ("Fix corruption caused by mmap flushing problems") on top. Additionally, include an open, but ack'd, pull request [1] that avoids a potential general protection fault due to touching a vbio after it was handed off to the kernel.

[0]: https://github.com/openzfs/zfs/commits/zfs-2.2.4-staging/
[1]: https://github.com/openzfs/zfs/pull/16049

Both should mostly touch the module code.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
commit 68be554e71 (parent 6c9ff9b992)
debian/patches/0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch (new file, vendored, 135 lines)
@@ -0,0 +1,135 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob N <robn@despairlabs.com>
|
||||
Date: Thu, 21 Mar 2024 10:46:15 +1100
|
||||
Subject: [PATCH] Linux 6.8 compat: use splice_copy_file_range() for fallback
|
||||
|
||||
Linux 6.8 removes generic_copy_file_range(), which had been reduced to a
|
||||
simple wrapper around splice_copy_file_range(). Detect that function
|
||||
directly and use it if generic_ is not available.
|
||||
|
||||
Sponsored-by: https://despairlabs.com/sponsor/
|
||||
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
|
||||
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <robn@despairlabs.com>
|
||||
Closes #15930
|
||||
Closes #15931
|
||||
(cherry picked from commit ef08a4d4065d21414d7fedccac20da6bfda4dfd0)
|
||||
---
|
||||
config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++
|
||||
config/kernel.m4 | 2 ++
|
||||
module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++--
|
||||
3 files changed, 43 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4
|
||||
index cc96404d8..8a5cbe2ee 100644
|
||||
--- a/config/kernel-vfs-file_range.m4
|
||||
+++ b/config/kernel-vfs-file_range.m4
|
||||
@@ -16,6 +16,9 @@ dnl #
|
||||
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
|
||||
dnl # generic_copy_file_range() added to support it
|
||||
dnl #
|
||||
+dnl # 6.8: generic_copy_file_range() removed, replaced by
|
||||
+dnl # splice_copy_file_range()
|
||||
+dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
|
||||
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
|
||||
#include <linux/fs.h>
|
||||
@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
|
||||
])
|
||||
])
|
||||
|
||||
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
|
||||
+ ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
|
||||
+ #include <linux/splice.h>
|
||||
+ ], [
|
||||
+ struct file *src_file __attribute__ ((unused)) = NULL;
|
||||
+ loff_t src_off __attribute__ ((unused)) = 0;
|
||||
+ struct file *dst_file __attribute__ ((unused)) = NULL;
|
||||
+ loff_t dst_off __attribute__ ((unused)) = 0;
|
||||
+ size_t len __attribute__ ((unused)) = 0;
|
||||
+ splice_copy_file_range(src_file, src_off, dst_file, dst_off,
|
||||
+ len);
|
||||
+ ])
|
||||
+])
|
||||
+AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
|
||||
+ AC_MSG_CHECKING([whether splice_copy_file_range() is available])
|
||||
+ ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
|
||||
+ AC_MSG_RESULT(yes)
|
||||
+ AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
|
||||
+ [splice_copy_file_range() is available])
|
||||
+ ],[
|
||||
+ AC_MSG_RESULT(no)
|
||||
+ ])
|
||||
+])
|
||||
+
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
|
||||
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
|
||||
#include <linux/fs.h>
|
||||
diff --git a/config/kernel.m4 b/config/kernel.m4
|
||||
index e3f864577..1d0c5a27f 100644
|
||||
--- a/config/kernel.m4
|
||||
+++ b/config/kernel.m4
|
||||
@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
|
||||
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
|
||||
+ ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
|
||||
@@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||
ZFS_AC_KERNEL_VFS_IOV_ITER
|
||||
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
|
||||
+ ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
|
||||
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
|
||||
index 3065d54fa..64728fdb1 100644
|
||||
--- a/module/os/linux/zfs/zpl_file_range.c
|
||||
+++ b/module/os/linux/zfs/zpl_file_range.c
|
||||
@@ -26,6 +26,9 @@
|
||||
#include <linux/compat.h>
|
||||
#endif
|
||||
#include <linux/fs.h>
|
||||
+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
|
||||
+#include <linux/splice.h>
|
||||
+#endif
|
||||
#include <sys/file.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/zfs_vnops.h>
|
||||
@@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
||||
ret = zpl_clone_file_range_impl(src_file, src_off,
|
||||
dst_file, dst_off, len);
|
||||
|
||||
-#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
|
||||
+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
|
||||
/*
|
||||
* Since Linux 5.3 the filesystem driver is responsible for executing
|
||||
* an appropriate fallback, and a generic fallback function is provided.
|
||||
@@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
||||
ret == -EAGAIN)
|
||||
ret = generic_copy_file_range(src_file, src_off, dst_file,
|
||||
dst_off, len, flags);
|
||||
+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
|
||||
+ /*
|
||||
+ * Since 6.8 the fallback function is called splice_copy_file_range
|
||||
+ * and has a slightly different signature.
|
||||
+ */
|
||||
+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
|
||||
+ ret == -EAGAIN)
|
||||
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
|
||||
+ dst_off, len);
|
||||
#else
|
||||
/*
|
||||
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
|
||||
@@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
||||
*/
|
||||
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
|
||||
ret = -EOPNOTSUPP;
|
||||
-#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
|
||||
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
|
||||
|
||||
return (ret);
|
||||
}
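For readability, the fallback chain this patch leaves in zpl_copy_file_range() can be summarised outside the diff context. This is an illustrative sketch condensed (and slightly simplified) from the hunks above, not an additional change:

/* Condensed, illustrative view of the copy_file_range() fallback chain. */
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) {
#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
	/* Linux 5.3 .. 6.7: the kernel provides a generic fallback */
	ret = generic_copy_file_range(src_file, src_off, dst_file,
	    dst_off, len, flags);
#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
	/* Linux 6.8+: splice-based fallback; note there is no flags argument */
	ret = splice_copy_file_range(src_file, src_off, dst_file,
	    dst_off, len);
#else
	/* Before 5.3: report EOPNOTSUPP and let the VFS perform the fallback */
	ret = -EOPNOTSUPP;
#endif
}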
debian/patches/0014-linux-5.4-compat-page_size.patch (new file, vendored, 121 lines)
@@ -0,0 +1,121 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Mon, 13 Nov 2023 17:55:29 +1100
|
||||
Subject: [PATCH] linux 5.4 compat: page_size()
|
||||
|
||||
Before 5.4 we have to do a little math.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit df04efe321a49c650f1fbaa6fd701fa2928cbe21)
|
||||
---
|
||||
config/kernel-mm-page-size.m4 | 17 +++++++++++
|
||||
config/kernel.m4 | 2 ++
|
||||
include/os/linux/Makefile.am | 1 +
|
||||
include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++
|
||||
4 files changed, 56 insertions(+)
|
||||
create mode 100644 config/kernel-mm-page-size.m4
|
||||
create mode 100644 include/os/linux/kernel/linux/mm_compat.h
|
||||
|
||||
diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4
|
||||
new file mode 100644
|
||||
index 000000000..d5ebd9269
|
||||
--- /dev/null
|
||||
+++ b/config/kernel-mm-page-size.m4
|
||||
@@ -0,0 +1,17 @@
|
||||
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
|
||||
+ ZFS_LINUX_TEST_SRC([page_size], [
|
||||
+ #include <linux/mm.h>
|
||||
+ ],[
|
||||
+ unsigned long s;
|
||||
+ s = page_size(NULL);
|
||||
+ ])
|
||||
+])
|
||||
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
|
||||
+ AC_MSG_CHECKING([whether page_size() is available])
|
||||
+ ZFS_LINUX_TEST_RESULT([page_size], [
|
||||
+ AC_MSG_RESULT(yes)
|
||||
+ AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
|
||||
+ ],[
|
||||
+ AC_MSG_RESULT(no)
|
||||
+ ])
|
||||
+])
|
||||
diff --git a/config/kernel.m4 b/config/kernel.m4
|
||||
index 1d0c5a27f..548905ccd 100644
|
||||
--- a/config/kernel.m4
|
||||
+++ b/config/kernel.m4
|
||||
@@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
|
||||
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
|
||||
ZFS_AC_KERNEL_SRC_SYNC_BDEV
|
||||
+ ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
|
||||
@@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
|
||||
ZFS_AC_KERNEL_COPY_SPLICE_READ
|
||||
ZFS_AC_KERNEL_SYNC_BDEV
|
||||
+ ZFS_AC_KERNEL_MM_PAGE_SIZE
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_CPU_HAS_FEATURE
|
||||
diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am
|
||||
index 3830d198d..51c27132b 100644
|
||||
--- a/include/os/linux/Makefile.am
|
||||
+++ b/include/os/linux/Makefile.am
|
||||
@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
|
||||
%D%/kernel/linux/compiler_compat.h \
|
||||
%D%/kernel/linux/dcache_compat.h \
|
||||
%D%/kernel/linux/kmap_compat.h \
|
||||
+ %D%/kernel/linux/mm_compat.h \
|
||||
%D%/kernel/linux/mod_compat.h \
|
||||
%D%/kernel/linux/page_compat.h \
|
||||
%D%/kernel/linux/percpu_compat.h \
|
||||
diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h
|
||||
new file mode 100644
|
||||
index 000000000..40056c68d
|
||||
--- /dev/null
|
||||
+++ b/include/os/linux/kernel/linux/mm_compat.h
|
||||
@@ -0,0 +1,36 @@
|
||||
+/*
|
||||
+ * CDDL HEADER START
|
||||
+ *
|
||||
+ * The contents of this file are subject to the terms of the
|
||||
+ * Common Development and Distribution License (the "License").
|
||||
+ * You may not use this file except in compliance with the License.
|
||||
+ *
|
||||
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
+ * or https://opensource.org/licenses/CDDL-1.0.
|
||||
+ * See the License for the specific language governing permissions
|
||||
+ * and limitations under the License.
|
||||
+ *
|
||||
+ * When distributing Covered Code, include this CDDL HEADER in each
|
||||
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
+ * If applicable, add the following below this CDDL HEADER, with the
|
||||
+ * fields enclosed by brackets "[]" replaced with your own identifying
|
||||
+ * information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
+ *
|
||||
+ * CDDL HEADER END
|
||||
+ */
|
||||
+
|
||||
+/*
|
||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
||||
+ */
|
||||
+
|
||||
+#ifndef _ZFS_MM_COMPAT_H
|
||||
+#define _ZFS_MM_COMPAT_H
|
||||
+
|
||||
+#include <linux/mm.h>
|
||||
+
|
||||
+/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
|
||||
+#ifndef HAVE_MM_PAGE_SIZE
|
||||
+#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
|
||||
+#endif
|
||||
+
|
||||
+#endif /* _ZFS_MM_COMPAT_H */
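As a quick illustration of the shim above, a hypothetical caller (not part of the patch) can use page_size() uniformly on pre-5.4 and newer kernels once mm_compat.h is included; on old kernels the macro expands to PAGE_SIZE shifted by the compound order:

/* Hypothetical example, not part of the patch: bytes remaining in a
 * (possibly compound) page starting at offset `off`. */
#include <linux/mm_compat.h>

static inline unsigned long
bytes_left_in_page(struct page *p, unsigned long off)
{
	/* page_size(p) covers the whole compound page, not just PAGE_SIZE */
	return (page_size(p) - off);
}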
debian/patches/0015-abd-add-page-iterator.patch (new file, vendored, 334 lines)
@@ -0,0 +1,334 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Mon, 11 Dec 2023 16:05:54 +1100
|
||||
Subject: [PATCH] abd: add page iterator
|
||||
|
||||
The regular ABD iterators yield data buffers, so they have to map and
|
||||
unmap pages into kernel memory. If the caller only wants to count
|
||||
chunks, or can use page pointers directly, then the map/unmap is just
|
||||
unnecessary overhead.
|
||||
|
||||
This adds adb_iterate_page_func, which yields unmapped struct page
|
||||
instead.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b)
|
||||
---
|
||||
include/sys/abd.h | 7 +++
|
||||
include/sys/abd_impl.h | 26 ++++++++-
|
||||
module/os/freebsd/zfs/abd_os.c | 4 +-
|
||||
module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++---
|
||||
module/zfs/abd.c | 42 +++++++++++++
|
||||
5 files changed, 169 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/include/sys/abd.h b/include/sys/abd.h
|
||||
index 750f9986c..8a2df0bca 100644
|
||||
--- a/include/sys/abd.h
|
||||
+++ b/include/sys/abd.h
|
||||
@@ -79,6 +79,9 @@ typedef struct abd {
|
||||
|
||||
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
|
||||
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
|
||||
+#if defined(__linux__) && defined(_KERNEL)
|
||||
+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
|
||||
+#endif
|
||||
|
||||
extern int zfs_abd_scatter_enabled;
|
||||
|
||||
@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
|
||||
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
|
||||
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
|
||||
abd_iter_func2_t *, void *);
|
||||
+#if defined(__linux__) && defined(_KERNEL)
|
||||
+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
|
||||
+ void *);
|
||||
+#endif
|
||||
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
|
||||
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
|
||||
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
|
||||
diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
|
||||
index 40546d4af..f88ea25e2 100644
|
||||
--- a/include/sys/abd_impl.h
|
||||
+++ b/include/sys/abd_impl.h
|
||||
@@ -21,6 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||||
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
|
||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
||||
*/
|
||||
|
||||
#ifndef _ABD_IMPL_H
|
||||
@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
|
||||
ABDSTAT_DECR /* Decrease abdstat values */
|
||||
} abd_stats_op_t;
|
||||
|
||||
-struct scatterlist; /* forward declaration */
|
||||
+/* forward declarations */
|
||||
+struct scatterlist;
|
||||
+struct page;
|
||||
|
||||
struct abd_iter {
|
||||
/* public interface */
|
||||
- void *iter_mapaddr; /* addr corresponding to iter_pos */
|
||||
- size_t iter_mapsize; /* length of data valid at mapaddr */
|
||||
+ union {
|
||||
+ /* for abd_iter_map()/abd_iter_unmap() */
|
||||
+ struct {
|
||||
+ /* addr corresponding to iter_pos */
|
||||
+ void *iter_mapaddr;
|
||||
+ /* length of data valid at mapaddr */
|
||||
+ size_t iter_mapsize;
|
||||
+ };
|
||||
+ /* for abd_iter_page() */
|
||||
+ struct {
|
||||
+ /* current page */
|
||||
+ struct page *iter_page;
|
||||
+ /* offset of data in page */
|
||||
+ size_t iter_page_doff;
|
||||
+ /* size of data in page */
|
||||
+ size_t iter_page_dsize;
|
||||
+ };
|
||||
+ };
|
||||
|
||||
/* private */
|
||||
abd_t *iter_abd; /* ABD being iterated through */
|
||||
@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
|
||||
void abd_iter_advance(struct abd_iter *, size_t);
|
||||
void abd_iter_map(struct abd_iter *);
|
||||
void abd_iter_unmap(struct abd_iter *);
|
||||
+void abd_iter_page(struct abd_iter *);
|
||||
|
||||
/*
|
||||
* Helper macros
|
||||
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
|
||||
index 58a37df62..3b812271f 100644
|
||||
--- a/module/os/freebsd/zfs/abd_os.c
|
||||
+++ b/module/os/freebsd/zfs/abd_os.c
|
||||
@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||||
{
|
||||
ASSERT(!abd_is_gang(abd));
|
||||
abd_verify(abd);
|
||||
+ memset(aiter, 0, sizeof (struct abd_iter));
|
||||
aiter->iter_abd = abd;
|
||||
- aiter->iter_pos = 0;
|
||||
- aiter->iter_mapaddr = NULL;
|
||||
- aiter->iter_mapsize = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
|
||||
index 24390fbbf..dae128012 100644
|
||||
--- a/module/os/linux/zfs/abd_os.c
|
||||
+++ b/module/os/linux/zfs/abd_os.c
|
||||
@@ -21,6 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||||
* Copyright (c) 2019 by Delphix. All rights reserved.
|
||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
||||
*/
|
||||
|
||||
/*
|
||||
@@ -59,6 +60,7 @@
|
||||
#include <sys/zfs_znode.h>
|
||||
#ifdef _KERNEL
|
||||
#include <linux/kmap_compat.h>
|
||||
+#include <linux/mm_compat.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#endif
|
||||
|
||||
@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||||
{
|
||||
ASSERT(!abd_is_gang(abd));
|
||||
abd_verify(abd);
|
||||
+ memset(aiter, 0, sizeof (struct abd_iter));
|
||||
aiter->iter_abd = abd;
|
||||
- aiter->iter_mapaddr = NULL;
|
||||
- aiter->iter_mapsize = 0;
|
||||
- aiter->iter_pos = 0;
|
||||
- if (abd_is_linear(abd)) {
|
||||
- aiter->iter_offset = 0;
|
||||
- aiter->iter_sg = NULL;
|
||||
- } else {
|
||||
+ if (!abd_is_linear(abd)) {
|
||||
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
|
||||
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
|
||||
}
|
||||
@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||||
boolean_t
|
||||
abd_iter_at_end(struct abd_iter *aiter)
|
||||
{
|
||||
+ ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
|
||||
return (aiter->iter_pos == aiter->iter_abd->abd_size);
|
||||
}
|
||||
|
||||
@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
|
||||
void
|
||||
abd_iter_advance(struct abd_iter *aiter, size_t amount)
|
||||
{
|
||||
+ /*
|
||||
+ * Ensure that last chunk is not in use. abd_iterate_*() must clear
|
||||
+ * this state (directly or abd_iter_unmap()) before advancing.
|
||||
+ */
|
||||
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
|
||||
ASSERT0(aiter->iter_mapsize);
|
||||
+ ASSERT3P(aiter->iter_page, ==, NULL);
|
||||
+ ASSERT0(aiter->iter_page_doff);
|
||||
+ ASSERT0(aiter->iter_page_dsize);
|
||||
|
||||
/* There's nothing left to advance to, so do nothing */
|
||||
if (abd_iter_at_end(aiter))
|
||||
@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
|
||||
}
|
||||
|
||||
#if defined(_KERNEL)
|
||||
+/*
|
||||
+ * Yield the next page struct and data offset and size within it, without
|
||||
+ * mapping it into the address space.
|
||||
+ */
|
||||
+void
|
||||
+abd_iter_page(struct abd_iter *aiter)
|
||||
+{
|
||||
+ if (abd_iter_at_end(aiter)) {
|
||||
+ aiter->iter_page = NULL;
|
||||
+ aiter->iter_page_doff = 0;
|
||||
+ aiter->iter_page_dsize = 0;
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ struct page *page;
|
||||
+ size_t doff, dsize;
|
||||
+
|
||||
+ if (abd_is_linear(aiter->iter_abd)) {
|
||||
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
|
||||
+
|
||||
+ /* memory address at iter_pos */
|
||||
+ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
|
||||
+
|
||||
+ /* struct page for address */
|
||||
+ page = is_vmalloc_addr(paddr) ?
|
||||
+ vmalloc_to_page(paddr) : virt_to_page(paddr);
|
||||
+
|
||||
+ /* offset of address within the page */
|
||||
+ doff = offset_in_page(paddr);
|
||||
+
|
||||
+ /* total data remaining in abd from this position */
|
||||
+ dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
|
||||
+ } else {
|
||||
+ ASSERT(!abd_is_gang(aiter->iter_abd));
|
||||
+
|
||||
+ /* current scatter page */
|
||||
+ page = sg_page(aiter->iter_sg);
|
||||
+
|
||||
+ /* position within page */
|
||||
+ doff = aiter->iter_offset;
|
||||
+
|
||||
+ /* remaining data in scatterlist */
|
||||
+ dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
|
||||
+ aiter->iter_abd->abd_size - aiter->iter_pos);
|
||||
+ }
|
||||
+ ASSERT(page);
|
||||
+
|
||||
+ if (PageTail(page)) {
|
||||
+ /*
|
||||
+ * This page is part of a "compound page", which is a group of
|
||||
+ * pages that can be referenced from a single struct page *.
|
||||
+ * Its organised as a "head" page, followed by a series of
|
||||
+ * "tail" pages.
|
||||
+ *
|
||||
+ * In OpenZFS, compound pages are allocated using the
|
||||
+ * __GFP_COMP flag, which we get from scatter ABDs and SPL
|
||||
+ * vmalloc slabs (ie >16K allocations). So a great many of the
|
||||
+ * IO buffers we get are going to be of this type.
|
||||
+ *
|
||||
+ * The tail pages are just regular PAGE_SIZE pages, and can be
|
||||
+ * safely used as-is. However, the head page has length
|
||||
+ * covering itself and all the tail pages. If this ABD chunk
|
||||
+ * spans multiple pages, then we can use the head page and a
|
||||
+ * >PAGE_SIZE length, which is far more efficient.
|
||||
+ *
|
||||
+ * To do this, we need to adjust the offset to be counted from
|
||||
+ * the head page. struct page for compound pages are stored
|
||||
+ * contiguously, so we can just adjust by a simple offset.
|
||||
+ */
|
||||
+ struct page *head = compound_head(page);
|
||||
+ doff += ((page - head) * PAGESIZE);
|
||||
+ page = head;
|
||||
+ }
|
||||
+
|
||||
+ /* final page and position within it */
|
||||
+ aiter->iter_page = page;
|
||||
+ aiter->iter_page_doff = doff;
|
||||
+
|
||||
+ /* amount of data in the chunk, up to the end of the page */
|
||||
+ aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* bio_nr_pages for ABD.
|
||||
* @off is the offset in @abd
|
||||
@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
|
||||
module_param(zfs_abd_scatter_max_order, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
|
||||
"Maximum order allocation used for a scatter ABD.");
|
||||
-#endif
|
||||
+
|
||||
+#endif /* _KERNEL */
|
||||
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
|
||||
index d982f201c..3388e2357 100644
|
||||
--- a/module/zfs/abd.c
|
||||
+++ b/module/zfs/abd.c
|
||||
@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
|
||||
return (ret);
|
||||
}
|
||||
|
||||
+#if defined(__linux__) && defined(_KERNEL)
|
||||
+int
|
||||
+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
|
||||
+ abd_iter_page_func_t *func, void *private)
|
||||
+{
|
||||
+ struct abd_iter aiter;
|
||||
+ int ret = 0;
|
||||
+
|
||||
+ if (size == 0)
|
||||
+ return (0);
|
||||
+
|
||||
+ abd_verify(abd);
|
||||
+ ASSERT3U(off + size, <=, abd->abd_size);
|
||||
+
|
||||
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
|
||||
+
|
||||
+ while (size > 0) {
|
||||
+ IMPLY(abd_is_gang(abd), c_abd != NULL);
|
||||
+
|
||||
+ abd_iter_page(&aiter);
|
||||
+
|
||||
+ size_t len = MIN(aiter.iter_page_dsize, size);
|
||||
+ ASSERT3U(len, >, 0);
|
||||
+
|
||||
+ ret = func(aiter.iter_page, aiter.iter_page_doff,
|
||||
+ len, private);
|
||||
+
|
||||
+ aiter.iter_page = NULL;
|
||||
+ aiter.iter_page_doff = 0;
|
||||
+ aiter.iter_page_dsize = 0;
|
||||
+
|
||||
+ if (ret != 0)
|
||||
+ break;
|
||||
+
|
||||
+ size -= len;
|
||||
+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
|
||||
+ }
|
||||
+
|
||||
+ return (ret);
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
struct buf_arg {
|
||||
void *arg_buf;
|
||||
};
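To show how the new iterator is meant to be consumed, here is a hypothetical caller (not part of the patch) for the "count chunks only" use mentioned in the commit message; the callback follows the abd_iter_page_func_t typedef added above:

/* Hypothetical example, not part of the patch: count page chunks in an
 * ABD range without mapping any of them into kernel address space. */
static int
abd_count_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
	(void) page; (void) off; (void) len;
	(*(uint_t *)priv)++;	/* one chunk == one candidate BIO segment */
	return (0);		/* returning non-zero stops the iteration */
}

static uint_t
abd_count_pages(abd_t *abd, size_t off, size_t size)
{
	uint_t n = 0;
	VERIFY0(abd_iterate_page_func(abd, off, size, abd_count_pages_cb, &n));
	return (n);
}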
debian/patches/0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch (new file, vendored, 349 lines)
@@ -0,0 +1,349 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Tue, 9 Jan 2024 12:12:56 +1100
|
||||
Subject: [PATCH] vdev_disk: rename existing functions to vdev_classic_*
|
||||
|
||||
This is just renaming the existing functions we're about to replace and
|
||||
grouping them together to make the next commits easier to follow.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4)
|
||||
---
|
||||
include/sys/abd.h | 2 +
|
||||
module/os/linux/zfs/abd_os.c | 5 +
|
||||
module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++---------------
|
||||
3 files changed, 120 insertions(+), 102 deletions(-)
|
||||
|
||||
diff --git a/include/sys/abd.h b/include/sys/abd.h
|
||||
index 8a2df0bca..bee38b831 100644
|
||||
--- a/include/sys/abd.h
|
||||
+++ b/include/sys/abd.h
|
||||
@@ -220,6 +220,8 @@ void abd_fini(void);
|
||||
|
||||
/*
|
||||
* Linux ABD bio functions
|
||||
+ * Note: these are only needed to support vdev_classic. See comment in
|
||||
+ * vdev_disk.c.
|
||||
*/
|
||||
#if defined(__linux__) && defined(_KERNEL)
|
||||
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
|
||||
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
|
||||
index dae128012..3fe01c0b7 100644
|
||||
--- a/module/os/linux/zfs/abd_os.c
|
||||
+++ b/module/os/linux/zfs/abd_os.c
|
||||
@@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter)
|
||||
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Note: ABD BIO functions only needed to support vdev_classic. See comments in
|
||||
+ * vdev_disk.c.
|
||||
+ */
|
||||
+
|
||||
/*
|
||||
* bio_nr_pages for ABD.
|
||||
* @off is the offset in @abd
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index b0bda5fa2..957619b87 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
|
||||
*/
|
||||
#define EFI_MIN_RESV_SIZE (16 * 1024)
|
||||
|
||||
-/*
|
||||
- * Virtual device vector for disks.
|
||||
- */
|
||||
-typedef struct dio_request {
|
||||
- zio_t *dr_zio; /* Parent ZIO */
|
||||
- atomic_t dr_ref; /* References */
|
||||
- int dr_error; /* Bio error */
|
||||
- int dr_bio_count; /* Count of bio's */
|
||||
- struct bio *dr_bio[]; /* Attached bio's */
|
||||
-} dio_request_t;
|
||||
-
|
||||
/*
|
||||
* BIO request failfast mask.
|
||||
*/
|
||||
@@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v)
|
||||
v->vdev_tsd = NULL;
|
||||
}
|
||||
|
||||
-static dio_request_t *
|
||||
-vdev_disk_dio_alloc(int bio_count)
|
||||
-{
|
||||
- dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
- sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||
- atomic_set(&dr->dr_ref, 0);
|
||||
- dr->dr_bio_count = bio_count;
|
||||
- dr->dr_error = 0;
|
||||
-
|
||||
- for (int i = 0; i < dr->dr_bio_count; i++)
|
||||
- dr->dr_bio[i] = NULL;
|
||||
-
|
||||
- return (dr);
|
||||
-}
|
||||
-
|
||||
-static void
|
||||
-vdev_disk_dio_free(dio_request_t *dr)
|
||||
-{
|
||||
- int i;
|
||||
-
|
||||
- for (i = 0; i < dr->dr_bio_count; i++)
|
||||
- if (dr->dr_bio[i])
|
||||
- bio_put(dr->dr_bio[i]);
|
||||
-
|
||||
- kmem_free(dr, sizeof (dio_request_t) +
|
||||
- sizeof (struct bio *) * dr->dr_bio_count);
|
||||
-}
|
||||
-
|
||||
-static void
|
||||
-vdev_disk_dio_get(dio_request_t *dr)
|
||||
-{
|
||||
- atomic_inc(&dr->dr_ref);
|
||||
-}
|
||||
-
|
||||
-static void
|
||||
-vdev_disk_dio_put(dio_request_t *dr)
|
||||
-{
|
||||
- int rc = atomic_dec_return(&dr->dr_ref);
|
||||
-
|
||||
- /*
|
||||
- * Free the dio_request when the last reference is dropped and
|
||||
- * ensure zio_interpret is called only once with the correct zio
|
||||
- */
|
||||
- if (rc == 0) {
|
||||
- zio_t *zio = dr->dr_zio;
|
||||
- int error = dr->dr_error;
|
||||
-
|
||||
- vdev_disk_dio_free(dr);
|
||||
-
|
||||
- if (zio) {
|
||||
- zio->io_error = error;
|
||||
- ASSERT3S(zio->io_error, >=, 0);
|
||||
- if (zio->io_error)
|
||||
- vdev_disk_error(zio);
|
||||
-
|
||||
- zio_delay_interrupt(zio);
|
||||
- }
|
||||
- }
|
||||
-}
|
||||
-
|
||||
-BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
|
||||
-{
|
||||
- dio_request_t *dr = bio->bi_private;
|
||||
-
|
||||
- if (dr->dr_error == 0) {
|
||||
-#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
- dr->dr_error = BIO_END_IO_ERROR(bio);
|
||||
-#else
|
||||
- if (error)
|
||||
- dr->dr_error = -(error);
|
||||
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
- dr->dr_error = EIO;
|
||||
-#endif
|
||||
- }
|
||||
-
|
||||
- /* Drop reference acquired by __vdev_disk_physio */
|
||||
- vdev_disk_dio_put(dr);
|
||||
-}
|
||||
-
|
||||
static inline void
|
||||
vdev_submit_bio_impl(struct bio *bio)
|
||||
{
|
||||
@@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
|
||||
return (bio);
|
||||
}
|
||||
|
||||
+/* ========== */
|
||||
+
|
||||
+/*
|
||||
+ * This is the classic, battle-tested BIO submission code.
|
||||
+ *
|
||||
+ * These functions have been renamed to vdev_classic_* to make it clear what
|
||||
+ * they belong to, but their implementations are unchanged.
|
||||
+ */
|
||||
+
|
||||
+/*
|
||||
+ * Virtual device vector for disks.
|
||||
+ */
|
||||
+typedef struct dio_request {
|
||||
+ zio_t *dr_zio; /* Parent ZIO */
|
||||
+ atomic_t dr_ref; /* References */
|
||||
+ int dr_error; /* Bio error */
|
||||
+ int dr_bio_count; /* Count of bio's */
|
||||
+ struct bio *dr_bio[]; /* Attached bio's */
|
||||
+} dio_request_t;
|
||||
+
|
||||
+static dio_request_t *
|
||||
+vdev_classic_dio_alloc(int bio_count)
|
||||
+{
|
||||
+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
+ sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||
+ atomic_set(&dr->dr_ref, 0);
|
||||
+ dr->dr_bio_count = bio_count;
|
||||
+ dr->dr_error = 0;
|
||||
+
|
||||
+ for (int i = 0; i < dr->dr_bio_count; i++)
|
||||
+ dr->dr_bio[i] = NULL;
|
||||
+
|
||||
+ return (dr);
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+vdev_classic_dio_free(dio_request_t *dr)
|
||||
+{
|
||||
+ int i;
|
||||
+
|
||||
+ for (i = 0; i < dr->dr_bio_count; i++)
|
||||
+ if (dr->dr_bio[i])
|
||||
+ bio_put(dr->dr_bio[i]);
|
||||
+
|
||||
+ kmem_free(dr, sizeof (dio_request_t) +
|
||||
+ sizeof (struct bio *) * dr->dr_bio_count);
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+vdev_classic_dio_get(dio_request_t *dr)
|
||||
+{
|
||||
+ atomic_inc(&dr->dr_ref);
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+vdev_classic_dio_put(dio_request_t *dr)
|
||||
+{
|
||||
+ int rc = atomic_dec_return(&dr->dr_ref);
|
||||
+
|
||||
+ /*
|
||||
+ * Free the dio_request when the last reference is dropped and
|
||||
+ * ensure zio_interpret is called only once with the correct zio
|
||||
+ */
|
||||
+ if (rc == 0) {
|
||||
+ zio_t *zio = dr->dr_zio;
|
||||
+ int error = dr->dr_error;
|
||||
+
|
||||
+ vdev_classic_dio_free(dr);
|
||||
+
|
||||
+ if (zio) {
|
||||
+ zio->io_error = error;
|
||||
+ ASSERT3S(zio->io_error, >=, 0);
|
||||
+ if (zio->io_error)
|
||||
+ vdev_disk_error(zio);
|
||||
+
|
||||
+ zio_delay_interrupt(zio);
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
|
||||
+{
|
||||
+ dio_request_t *dr = bio->bi_private;
|
||||
+
|
||||
+ if (dr->dr_error == 0) {
|
||||
+#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
+ dr->dr_error = BIO_END_IO_ERROR(bio);
|
||||
+#else
|
||||
+ if (error)
|
||||
+ dr->dr_error = -(error);
|
||||
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
+ dr->dr_error = EIO;
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
+ /* Drop reference acquired by vdev_classic_physio */
|
||||
+ vdev_classic_dio_put(dr);
|
||||
+}
|
||||
+
|
||||
static inline unsigned int
|
||||
-vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||
+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||
{
|
||||
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
|
||||
bio_size, abd_offset);
|
||||
@@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||
}
|
||||
|
||||
static int
|
||||
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
||||
+vdev_classic_physio(struct block_device *bdev, zio_t *zio,
|
||||
size_t io_size, uint64_t io_offset, int rw, int flags)
|
||||
{
|
||||
dio_request_t *dr;
|
||||
@@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
||||
}
|
||||
|
||||
retry:
|
||||
- dr = vdev_disk_dio_alloc(bio_count);
|
||||
+ dr = vdev_classic_dio_alloc(bio_count);
|
||||
|
||||
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
||||
zio->io_vd->vdev_failfast == B_TRUE) {
|
||||
@@ -771,23 +780,23 @@ retry:
|
||||
* this should be rare - see the comment above.
|
||||
*/
|
||||
if (dr->dr_bio_count == i) {
|
||||
- vdev_disk_dio_free(dr);
|
||||
+ vdev_classic_dio_free(dr);
|
||||
bio_count *= 2;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
- nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
|
||||
+ nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
|
||||
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
|
||||
if (unlikely(dr->dr_bio[i] == NULL)) {
|
||||
- vdev_disk_dio_free(dr);
|
||||
+ vdev_classic_dio_free(dr);
|
||||
return (SET_ERROR(ENOMEM));
|
||||
}
|
||||
|
||||
- /* Matching put called by vdev_disk_physio_completion */
|
||||
- vdev_disk_dio_get(dr);
|
||||
+ /* Matching put called by vdev_classic_physio_completion */
|
||||
+ vdev_classic_dio_get(dr);
|
||||
|
||||
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
|
||||
- dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
|
||||
+ dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
|
||||
dr->dr_bio[i]->bi_private = dr;
|
||||
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
|
||||
|
||||
@@ -801,7 +810,7 @@ retry:
|
||||
}
|
||||
|
||||
/* Extra reference to protect dio_request during vdev_submit_bio */
|
||||
- vdev_disk_dio_get(dr);
|
||||
+ vdev_classic_dio_get(dr);
|
||||
|
||||
if (dr->dr_bio_count > 1)
|
||||
blk_start_plug(&plug);
|
||||
@@ -815,11 +824,13 @@ retry:
|
||||
if (dr->dr_bio_count > 1)
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
- vdev_disk_dio_put(dr);
|
||||
+ vdev_classic_dio_put(dr);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
+/* ========== */
|
||||
+
|
||||
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
|
||||
{
|
||||
zio_t *zio = bio->bi_private;
|
||||
@@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio)
|
||||
}
|
||||
|
||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
- error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
|
||||
+ error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
|
||||
zio->io_size, zio->io_offset, rw, 0);
|
||||
rw_exit(&vd->vd_lock);
|
||||
|
debian/patches/0017-vdev_disk-reorganise-vdev_disk_io_start.patch (new file, vendored, 111 lines)
@@ -0,0 +1,111 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Tue, 9 Jan 2024 12:23:30 +1100
|
||||
Subject: [PATCH] vdev_disk: reorganise vdev_disk_io_start
|
||||
|
||||
Light reshuffle to make it a bit more linear to read and get rid of a
|
||||
bunch of args that aren't needed in all cases.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit 867178ae1db28e73051c8a7ce662f2f2f81cd8e6)
|
||||
---
|
||||
module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++-------------
|
||||
1 file changed, 31 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index 957619b87..51e7cef2f 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||
}
|
||||
|
||||
static int
|
||||
-vdev_classic_physio(struct block_device *bdev, zio_t *zio,
|
||||
- size_t io_size, uint64_t io_offset, int rw, int flags)
|
||||
+vdev_classic_physio(zio_t *zio)
|
||||
{
|
||||
+ vdev_t *v = zio->io_vd;
|
||||
+ vdev_disk_t *vd = v->vdev_tsd;
|
||||
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
||||
+ size_t io_size = zio->io_size;
|
||||
+ uint64_t io_offset = zio->io_offset;
|
||||
+ int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
|
||||
+ int flags = 0;
|
||||
+
|
||||
dio_request_t *dr;
|
||||
uint64_t abd_offset;
|
||||
uint64_t bio_offset;
|
||||
@@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio)
|
||||
{
|
||||
vdev_t *v = zio->io_vd;
|
||||
vdev_disk_t *vd = v->vdev_tsd;
|
||||
- int rw, error;
|
||||
+ int error;
|
||||
|
||||
/*
|
||||
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
|
||||
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio)
|
||||
rw_exit(&vd->vd_lock);
|
||||
zio_execute(zio);
|
||||
return;
|
||||
- case ZIO_TYPE_WRITE:
|
||||
- rw = WRITE;
|
||||
- break;
|
||||
-
|
||||
- case ZIO_TYPE_READ:
|
||||
- rw = READ;
|
||||
- break;
|
||||
|
||||
case ZIO_TYPE_TRIM:
|
||||
zio->io_error = vdev_disk_io_trim(zio);
|
||||
@@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio)
|
||||
#endif
|
||||
return;
|
||||
|
||||
- default:
|
||||
+ case ZIO_TYPE_READ:
|
||||
+ case ZIO_TYPE_WRITE:
|
||||
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
+ error = vdev_classic_physio(zio);
|
||||
rw_exit(&vd->vd_lock);
|
||||
- zio->io_error = SET_ERROR(ENOTSUP);
|
||||
- zio_interrupt(zio);
|
||||
+ if (error) {
|
||||
+ zio->io_error = error;
|
||||
+ zio_interrupt(zio);
|
||||
+ }
|
||||
return;
|
||||
- }
|
||||
|
||||
- zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
- error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio,
|
||||
- zio->io_size, zio->io_offset, rw, 0);
|
||||
- rw_exit(&vd->vd_lock);
|
||||
+ default:
|
||||
+ /*
|
||||
+ * Getting here means our parent vdev has made a very strange
|
||||
+ * request of us, and shouldn't happen. Assert here to force a
|
||||
+ * crash in dev builds, but in production return the IO
|
||||
+ * unhandled. The pool will likely suspend anyway but that's
|
||||
+ * nicer than crashing the kernel.
|
||||
+ */
|
||||
+ ASSERT3S(zio->io_type, ==, -1);
|
||||
|
||||
- if (error) {
|
||||
- zio->io_error = error;
|
||||
+ rw_exit(&vd->vd_lock);
|
||||
+ zio->io_error = SET_ERROR(ENOTSUP);
|
||||
zio_interrupt(zio);
|
||||
return;
|
||||
}
|
||||
+
|
||||
+ __builtin_unreachable();
|
||||
}
|
||||
|
||||
static void
|
debian/patches/0018-vdev_disk-make-read-write-IO-function-configurable.patch (new file, vendored, 69 lines)
@@ -0,0 +1,69 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Tue, 9 Jan 2024 12:29:19 +1100
|
||||
Subject: [PATCH] vdev_disk: make read/write IO function configurable
|
||||
|
||||
This is just setting up for the next couple of commits, which will add a
|
||||
new IO function and a parameter to select it.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit c4a13ba483f08a81aa47479d2f763a470d95b2b0)
|
||||
---
|
||||
module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++--
|
||||
1 file changed, 21 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index 51e7cef2f..de4dba72f 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio)
|
||||
#endif
|
||||
}
|
||||
|
||||
+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
|
||||
+
|
||||
static void
|
||||
vdev_disk_io_start(zio_t *zio)
|
||||
{
|
||||
@@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio)
|
||||
case ZIO_TYPE_READ:
|
||||
case ZIO_TYPE_WRITE:
|
||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
- error = vdev_classic_physio(zio);
|
||||
+ error = vdev_disk_io_rw_fn(zio);
|
||||
rw_exit(&vd->vd_lock);
|
||||
if (error) {
|
||||
zio->io_error = error;
|
||||
@@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd)
|
||||
/* XXX: Implement me as a vnode rele for the device */
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * At first use vdev use, set the submission function from the default value if
|
||||
+ * it hasn't been set already.
|
||||
+ */
|
||||
+static int
|
||||
+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
||||
+{
|
||||
+ (void) spa;
|
||||
+ (void) nv;
|
||||
+ (void) tsd;
|
||||
+
|
||||
+ if (vdev_disk_io_rw_fn == NULL)
|
||||
+ vdev_disk_io_rw_fn = vdev_classic_physio;
|
||||
+
|
||||
+ return (0);
|
||||
+}
|
||||
+
|
||||
vdev_ops_t vdev_disk_ops = {
|
||||
- .vdev_op_init = NULL,
|
||||
+ .vdev_op_init = vdev_disk_init,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_disk_open,
|
||||
.vdev_op_close = vdev_disk_close,
|
debian/patches/0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch (new file, vendored, 671 lines)
@@ -0,0 +1,671 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Tue, 18 Jul 2023 11:11:29 +1000
|
||||
Subject: [PATCH] vdev_disk: rewrite BIO filling machinery to avoid split pages
|
||||
|
||||
This commit tackles a number of issues in the way BIOs (`struct bio`)
|
||||
are constructed for submission to the Linux block layer.
|
||||
|
||||
The kernel has a hard upper limit on the number of pages/segments that
|
||||
can be added to a BIO, as well as a separate limit for each device
|
||||
(related to its queue depth and other scheduling characteristics).
|
||||
|
||||
ZFS counts the number of memory pages in the request ABD
|
||||
(`abd_nr_pages_off()`, and then uses that as the number of segments to
|
||||
put into the BIO, up to the hard upper limit. If it requires more than
|
||||
the limit, it will create multiple BIOs.
|
||||
|
||||
Leaving aside the fact that page count method is wrong (see below), not
|
||||
limiting to the device segment max means that the device driver will
|
||||
need to split the BIO in half. This is alone is not necessarily a
|
||||
problem, but it interacts with another issue to cause a much larger
|
||||
problem.
|
||||
|
||||
The kernel function to add a segment to a BIO (`bio_add_page()`) takes a
|
||||
`struct page` pointer, and offset+len within it. `struct page` can
|
||||
represent a run of contiguous memory pages (known as a "compound page").
|
||||
In can be of arbitrary length.
|
||||
|
||||
The ZFS functions that count ABD pages and load them into the BIO
|
||||
(`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never
|
||||
consider a page to be more than `PAGE_SIZE` (4K), even if the `struct
|
||||
page` is for multiple pages. In this case, it will load the same `struct
|
||||
page` into the BIO multiple times, with the offset adjusted each time.
|
||||
|
||||
With a sufficiently large ABD, this can easily lead to the BIO being
|
||||
entirely filled much earlier than it could have been. This is also
|
||||
further contributes to the problem caused by the incorrect segment limit
|
||||
calculation, as its much easier to go past the device limit, and so
|
||||
require a split.
|
||||
|
||||
Again, this is not a problem on its own.
|
||||
|
||||
The logic for "never submit more than `PAGE_SIZE`" is actually a little
|
||||
more subtle. It will actually never submit a buffer that crosses a 4K
|
||||
page boundary.
|
||||
|
||||
In practice, this is fine, as most ABDs are scattered, that is a list of
|
||||
complete 4K pages, and so are loaded in as such.
|
||||
|
||||
Linear ABDs are typically allocated from slabs, and for small sizes they
|
||||
are frequently not aligned to page boundaries. For example, a 12K
|
||||
allocation can span four pages, eg:
|
||||
|
||||
-- 4K -- -- 4K -- -- 4K -- -- 4K --
|
||||
| | | | |
|
||||
:## ######## ######## ######: [1K, 4K, 4K, 3K]
|
||||
|
||||
Such an allocation would be loaded into a BIO as you see:
|
||||
|
||||
[1K, 4K, 4K, 3K]
|
||||
|
||||
This tends not to be a problem in practice, because even if the BIO were
|
||||
filled and needed to be split, each half would still have either a start
|
||||
or end aligned to the logical block size of the device (assuming 4K at
|
||||
least).
|
||||
|
||||
---
|
||||
|
||||
In ideal circumstances, these shortcomings don't cause any particular
|
||||
problems. Its when they start to interact with other ZFS features that
|
||||
things get interesting.
|
||||
|
||||
Aggregation will create a "gang" ABD, which is simply a list of other
|
||||
ABDs. Iterating over a gang ABD is just iterating over each ABD within
|
||||
it in turn.
|
||||
|
||||
Because the segments are simply loaded in order, we can end up with
|
||||
uneven segments either side of the "gap" between the two ABDs. For
|
||||
example, two 12K ABDs might be aggregated and then loaded as:
|
||||
|
||||
[1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]
|
||||
|
||||
Should a split occur, each individual BIO can end up either having an
|
||||
start or end offset that is not aligned to the logical block size, which
|
||||
some drivers (eg SCSI) will reject. However, this tends not to happen
|
||||
because the default aggregation limit usually keeps the BIO small enough
|
||||
to not require more than one split, and most pages are actually full 4K
|
||||
pages, so hitting an uneven gap is very rare anyway.
|
||||
|
||||
If the pool is under particular memory pressure, then an IO can be
|
||||
broken down into a "gang block", a 512-byte block composed of a header
|
||||
and up to three block pointers. Each points to a fragment of the
|
||||
original write, or in turn, another gang block, breaking the original
|
||||
data up over and over until space can be found in the pool for each of
|
||||
them.
|
||||
|
||||
Each gang header is a separate 512-byte memory allocation from a slab,
|
||||
that needs to be written down to disk. When the gang header is added to
|
||||
the BIO, its a single 512-byte segment.
|
||||
|
||||
Pulling all this together, consider a large aggregated write of gang
|
||||
blocks. This results a BIO containing lots of 512-byte segments. Given
|
||||
our tendency to overfill the BIO, a split is likely, and most possible
|
||||
split points will yield a pair of BIOs that are misaligned. Drivers that
|
||||
care, like the SCSI driver, will reject them.
|
||||
|
||||
---
|
||||
|
||||
This commit is a substantial refactor and rewrite of much of `vdev_disk`
|
||||
to sort all this out.
|
||||
|
||||
`vdev_bio_max_segs()` now returns the ideal maximum size for the device,
|
||||
if available. There's also a tuneable `zfs_vdev_disk_max_segs` to
|
||||
override this, to assist with testing.
|
||||
|
||||
We scan the ABD up front to count the number of pages within it, and to
|
||||
confirm that if we submitted all those pages to one or more BIOs, it
|
||||
could be split at any point with creating a misaligned BIO. If the
|
||||
pages in the BIO are not usable (as in any of the above situations), the
|
||||
ABD is linearised, and then checked again. This is the same technique
|
||||
used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size
|
||||
and allocator quirks.
|
||||
|
||||
`vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The
|
||||
idea is simply that it can hold all the state needed to create, submit
|
||||
and return multiple BIOs, including all the refcounts, the ABD copy if
|
||||
it was needed, and so on. Apart from what I hope is a clearer interface,
|
||||
the major difference is that because we know how many BIOs we'll need up
|
||||
front, we don't need the old overflow logic that would grow the BIO
|
||||
array, throw away all the old work and restart. We can get it right from
|
||||
the start.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8)
|
||||
---
|
||||
include/os/linux/kernel/linux/mod_compat.h | 1 +
|
||||
man/man4/zfs.4 | 10 +-
|
||||
module/os/linux/zfs/vdev_disk.c | 439 ++++++++++++++++++++-
|
||||
3 files changed, 447 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h
|
||||
index 8e20a9613..039865b70 100644
|
||||
--- a/include/os/linux/kernel/linux/mod_compat.h
|
||||
+++ b/include/os/linux/kernel/linux/mod_compat.h
|
||||
@@ -68,6 +68,7 @@ enum scope_prefix_types {
|
||||
zfs_trim,
|
||||
zfs_txg,
|
||||
zfs_vdev,
|
||||
+ zfs_vdev_disk,
|
||||
zfs_vdev_file,
|
||||
zfs_vdev_mirror,
|
||||
zfs_vnops,
|
||||
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
|
||||
index 352990e02..b5679f2f0 100644
|
||||
--- a/man/man4/zfs.4
|
||||
+++ b/man/man4/zfs.4
|
||||
@@ -2,6 +2,7 @@
|
||||
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
|
||||
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
|
||||
.\" Copyright (c) 2019 Datto Inc.
|
||||
+.\" Copyright (c) 2023, 2024 Klara, Inc.
|
||||
.\" The contents of this file are subject to the terms of the Common Development
|
||||
.\" and Distribution License (the "License"). You may not use this file except
|
||||
.\" in compliance with the License. You can obtain a copy of the license at
|
||||
@@ -15,7 +16,7 @@
|
||||
.\" own identifying information:
|
||||
.\" Portions Copyright [yyyy] [name of copyright owner]
|
||||
.\"
|
||||
-.Dd July 21, 2023
|
||||
+.Dd January 9, 2024
|
||||
.Dt ZFS 4
|
||||
.Os
|
||||
.
|
||||
@@ -1345,6 +1346,13 @@ _
|
||||
4 Driver No driver retries on driver errors.
|
||||
.TE
|
||||
.
|
||||
+.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
|
||||
+Maximum number of segments to add to a BIO (min 4).
|
||||
+If this is higher than the maximum allowed by the device queue or the kernel
|
||||
+itself, it will be clamped.
|
||||
+Setting it to zero will cause the kernel's ideal size to be used.
|
||||
+This parameter only applies on Linux.
|
||||
+.
|
||||
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
|
||||
Time before expiring
|
||||
.Pa .zfs/snapshot .
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index de4dba72f..0ccb9ad96 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -24,6 +24,7 @@
|
||||
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
||||
* LLNL-CODE-403049.
|
||||
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
||||
+ * Copyright (c) 2023, 2024, Klara Inc.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
@@ -66,6 +67,13 @@ typedef struct vdev_disk {
|
||||
krwlock_t vd_lock;
|
||||
} vdev_disk_t;
|
||||
|
||||
+/*
|
||||
+ * Maximum number of segments to add to a bio (min 4). If this is higher than
|
||||
+ * the maximum allowed by the device queue or the kernel itself, it will be
|
||||
+ * clamped. Setting it to zero will cause the kernel's ideal size to be used.
|
||||
+ */
|
||||
+uint_t zfs_vdev_disk_max_segs = 0;
|
||||
+
|
||||
/*
|
||||
* Unique identifier for the exclusive vdev holder.
|
||||
*/
|
||||
@@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
|
||||
return (bio);
|
||||
}
|
||||
|
||||
+static inline uint_t
|
||||
+vdev_bio_max_segs(struct block_device *bdev)
|
||||
+{
|
||||
+ /*
|
||||
+ * Smallest of the device max segs and the tuneable max segs. Minimum
|
||||
+ * 4, so there's room to finish split pages if they come up.
|
||||
+ */
|
||||
+ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
|
||||
+ const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
|
||||
+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
|
||||
+ const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
|
||||
+
|
||||
+#ifdef HAVE_BIO_MAX_SEGS
|
||||
+ return (bio_max_segs(max_segs));
|
||||
+#else
|
||||
+ return (MIN(max_segs, BIO_MAX_PAGES));
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+static inline uint_t
|
||||
+vdev_bio_max_bytes(struct block_device *bdev)
|
||||
+{
|
||||
+ return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
|
||||
+}
|
||||
+
|
||||
+
|
||||
+/*
|
||||
+ * Virtual block IO object (VBIO)
|
||||
+ *
|
||||
+ * Linux block IO (BIO) objects have a limit on how many data segments (pages)
|
||||
+ * they can hold. Depending on how they're allocated and structured, a large
|
||||
+ * ZIO can require more than one BIO to be submitted to the kernel, which then
|
||||
+ * all have to complete before we can return the completed ZIO back to ZFS.
|
||||
+ *
|
||||
+ * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
|
||||
+ * translate a ZIO down into the kernel block layer and back again.
|
||||
+ *
|
||||
+ * Note that these are only used for data ZIOs (read/write). Meta-operations
|
||||
+ * (flush/trim) don't need multiple BIOs and so can just make the call
|
||||
+ * directly.
|
||||
+ */
|
||||
+typedef struct {
|
||||
+ zio_t *vbio_zio; /* parent zio */
|
||||
+
|
||||
+ struct block_device *vbio_bdev; /* blockdev to submit bios to */
|
||||
+
|
||||
+ abd_t *vbio_abd; /* abd carrying borrowed linear buf */
|
||||
+
|
||||
+ atomic_t vbio_ref; /* bio refcount */
|
||||
+ int vbio_error; /* error from failed bio */
|
||||
+
|
||||
+ uint_t vbio_max_segs; /* max segs per bio */
|
||||
+
|
||||
+ uint_t vbio_max_bytes; /* max bytes per bio */
|
||||
+ uint_t vbio_lbs_mask; /* logical block size mask */
|
||||
+
|
||||
+ uint64_t vbio_offset; /* start offset of next bio */
|
||||
+
|
||||
+ struct bio *vbio_bio; /* pointer to the current bio */
|
||||
+ struct bio *vbio_bios; /* list of all bios */
|
||||
+} vbio_t;
|
||||
+
|
||||
+static vbio_t *
|
||||
+vbio_alloc(zio_t *zio, struct block_device *bdev)
|
||||
+{
|
||||
+ vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
|
||||
+
|
||||
+ vbio->vbio_zio = zio;
|
||||
+ vbio->vbio_bdev = bdev;
|
||||
+ atomic_set(&vbio->vbio_ref, 0);
|
||||
+ vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
|
||||
+ vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
|
||||
+ vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
|
||||
+ vbio->vbio_offset = zio->io_offset;
|
||||
+
|
||||
+ return (vbio);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
||||
+{
|
||||
+ struct bio *bio;
|
||||
+ uint_t ssize;
|
||||
+
|
||||
+ while (size > 0) {
|
||||
+ bio = vbio->vbio_bio;
|
||||
+ if (bio == NULL) {
|
||||
+ /* New BIO, allocate and set up */
|
||||
+ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
|
||||
+ vbio->vbio_max_segs);
|
||||
+ if (unlikely(bio == NULL))
|
||||
+ return (SET_ERROR(ENOMEM));
|
||||
+ BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
|
||||
+
|
||||
+ bio->bi_next = vbio->vbio_bios;
|
||||
+ vbio->vbio_bios = vbio->vbio_bio = bio;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Only load as much of the current page data as will fit in
|
||||
+ * the space left in the BIO, respecting lbs alignment. Older
|
||||
+ * kernels will error if we try to overfill the BIO, while
|
||||
+ * newer ones will accept it and split the BIO. This ensures
|
||||
+ * everything works on older kernels, and avoids an additional
|
||||
+ * overhead on the new.
|
||||
+ */
|
||||
+ ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
|
||||
+ vbio->vbio_lbs_mask);
|
||||
+ if (ssize > 0 &&
|
||||
+ bio_add_page(bio, page, ssize, offset) == ssize) {
|
||||
+ /* Accepted, adjust and load any remaining. */
|
||||
+ size -= ssize;
|
||||
+ offset += ssize;
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ /* No room, set up for a new BIO and loop */
|
||||
+ vbio->vbio_offset += BIO_BI_SIZE(bio);
|
||||
+
|
||||
+ /* Signal new BIO allocation wanted */
|
||||
+ vbio->vbio_bio = NULL;
|
||||
+ }
|
||||
+
|
||||
+ return (0);
|
||||
+}
|
||||
+
|
||||
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
|
||||
+static void vbio_put(vbio_t *vbio);
|
||||
+
|
||||
+static void
|
||||
+vbio_submit(vbio_t *vbio, int flags)
|
||||
+{
|
||||
+ ASSERT(vbio->vbio_bios);
|
||||
+ struct bio *bio = vbio->vbio_bios;
|
||||
+ vbio->vbio_bio = vbio->vbio_bios = NULL;
|
||||
+
|
||||
+ /*
|
||||
+ * We take a reference for each BIO as we submit it, plus one to
|
||||
+ * protect us from BIOs completing before we're done submitting them
|
||||
+ * all, causing vbio_put() to free vbio out from under us and/or the
|
||||
+ * zio to be returned before all its IO has completed.
|
||||
+ */
|
||||
+ atomic_set(&vbio->vbio_ref, 1);
|
||||
+
|
||||
+ /*
|
||||
+ * If we're submitting more than one BIO, inform the block layer so
|
||||
+ * it can batch them if it wants.
|
||||
+ */
|
||||
+ struct blk_plug plug;
|
||||
+ boolean_t do_plug = (bio->bi_next != NULL);
|
||||
+ if (do_plug)
|
||||
+ blk_start_plug(&plug);
|
||||
+
|
||||
+ /* Submit all the BIOs */
|
||||
+ while (bio != NULL) {
|
||||
+ atomic_inc(&vbio->vbio_ref);
|
||||
+
|
||||
+ struct bio *next = bio->bi_next;
|
||||
+ bio->bi_next = NULL;
|
||||
+
|
||||
+ bio->bi_end_io = vdev_disk_io_rw_completion;
|
||||
+ bio->bi_private = vbio;
|
||||
+ bio_set_op_attrs(bio,
|
||||
+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
||||
+ WRITE : READ, flags);
|
||||
+
|
||||
+ vdev_submit_bio(bio);
|
||||
+
|
||||
+ bio = next;
|
||||
+ }
|
||||
+
|
||||
+ /* Finish the batch */
|
||||
+ if (do_plug)
|
||||
+ blk_finish_plug(&plug);
|
||||
+
|
||||
+ /* Release the extra reference */
|
||||
+ vbio_put(vbio);
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+vbio_return_abd(vbio_t *vbio)
|
||||
+{
|
||||
+ zio_t *zio = vbio->vbio_zio;
|
||||
+ if (vbio->vbio_abd == NULL)
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * If we copied the ABD before issuing it, clean up and return the copy
|
||||
+ * to the ADB, with changes if appropriate.
|
||||
+ */
|
||||
+ void *buf = abd_to_buf(vbio->vbio_abd);
|
||||
+ abd_free(vbio->vbio_abd);
|
||||
+ vbio->vbio_abd = NULL;
|
||||
+
|
||||
+ if (zio->io_type == ZIO_TYPE_READ)
|
||||
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
||||
+ else
|
||||
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+vbio_free(vbio_t *vbio)
|
||||
+{
|
||||
+ VERIFY0(atomic_read(&vbio->vbio_ref));
|
||||
+
|
||||
+ vbio_return_abd(vbio);
|
||||
+
|
||||
+ kmem_free(vbio, sizeof (vbio_t));
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+vbio_put(vbio_t *vbio)
|
||||
+{
|
||||
+ if (atomic_dec_return(&vbio->vbio_ref) > 0)
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * This was the last reference, so the entire IO is completed. Clean
|
||||
+ * up and submit it for processing.
|
||||
+ */
|
||||
+
|
||||
+ /*
|
||||
+ * Get any data buf back to the original ABD, if necessary. We do this
|
||||
+ * now so we can get the ZIO into the pipeline as quickly as possible,
|
||||
+ * and then do the remaining cleanup after.
|
||||
+ */
|
||||
+ vbio_return_abd(vbio);
|
||||
+
|
||||
+ zio_t *zio = vbio->vbio_zio;
|
||||
+
|
||||
+ /*
|
||||
+ * Set the overall error. If multiple BIOs returned an error, only the
|
||||
+ * first will be taken; the others are dropped (see
|
||||
+ * vdev_disk_io_rw_completion()). Its pretty much impossible for
|
||||
+ * multiple IOs to the same device to fail with different errors, so
|
||||
+ * there's no real risk.
|
||||
+ */
|
||||
+ zio->io_error = vbio->vbio_error;
|
||||
+ if (zio->io_error)
|
||||
+ vdev_disk_error(zio);
|
||||
+
|
||||
+ /* All done, submit for processing */
|
||||
+ zio_delay_interrupt(zio);
|
||||
+
|
||||
+ /* Finish cleanup */
|
||||
+ vbio_free(vbio);
|
||||
+}
|
||||
+
|
||||
+BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
|
||||
+{
|
||||
+ vbio_t *vbio = bio->bi_private;
|
||||
+
|
||||
+ if (vbio->vbio_error == 0) {
|
||||
+#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
+ vbio->vbio_error = BIO_END_IO_ERROR(bio);
|
||||
+#else
|
||||
+ if (error)
|
||||
+ vbio->vbio_error = -(error);
|
||||
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
+ vbio->vbio_error = EIO;
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Destroy the BIO. This is safe to do; the vbio owns its data and the
|
||||
+ * kernel won't touch it again after the completion function runs.
|
||||
+ */
|
||||
+ bio_put(bio);
|
||||
+
|
||||
+ /* Drop this BIOs reference acquired by vbio_submit() */
|
||||
+ vbio_put(vbio);
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Iterator callback to count ABD pages and check their size & alignment.
|
||||
+ *
|
||||
+ * On Linux, each BIO segment can take a page pointer, and an offset+length of
|
||||
+ * the data within that page. A page can be arbitrarily large ("compound"
|
||||
+ * pages) but we still have to ensure the data portion is correctly sized and
|
||||
+ * aligned to the logical block size, to ensure that if the kernel wants to
|
||||
+ * split the BIO, the two halves will still be properly aligned.
|
||||
+ */
|
||||
+typedef struct {
|
||||
+ uint_t bmask;
|
||||
+ uint_t npages;
|
||||
+ uint_t end;
|
||||
+} vdev_disk_check_pages_t;
|
||||
+
|
||||
+static int
|
||||
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||
+{
|
||||
+ vdev_disk_check_pages_t *s = priv;
|
||||
+
|
||||
+ /*
|
||||
+ * If we didn't finish on a block size boundary last time, then there
|
||||
+ * would be a gap if we tried to use this ABD as-is, so abort.
|
||||
+ */
|
||||
+ if (s->end != 0)
|
||||
+ return (1);
|
||||
+
|
||||
+ /*
|
||||
+ * Note if we're taking less than a full block, so we can check it
|
||||
+ * above on the next call.
|
||||
+ */
|
||||
+ s->end = len & s->bmask;
|
||||
+
|
||||
+ /* All blocks after the first must start on a block size boundary. */
|
||||
+ if (s->npages != 0 && (off & s->bmask) != 0)
|
||||
+ return (1);
|
||||
+
|
||||
+ s->npages++;
|
||||
+ return (0);
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Check if we can submit the pages in this ABD to the kernel as-is. Returns
|
||||
+ * the number of pages, or 0 if it can't be submitted like this.
|
||||
+ */
|
||||
+static boolean_t
|
||||
+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
|
||||
+{
|
||||
+ vdev_disk_check_pages_t s = {
|
||||
+ .bmask = bdev_logical_block_size(bdev)-1,
|
||||
+ .npages = 0,
|
||||
+ .end = 0,
|
||||
+ };
|
||||
+
|
||||
+ if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
|
||||
+ return (B_FALSE);
|
||||
+
|
||||
+ return (B_TRUE);
|
||||
+}
|
||||
+
|
||||
+/* Iterator callback to submit ABD pages to the vbio. */
|
||||
+static int
|
||||
+vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||
+{
|
||||
+ vbio_t *vbio = priv;
|
||||
+ return (vbio_add_page(vbio, page, len, off));
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+vdev_disk_io_rw(zio_t *zio)
|
||||
+{
|
||||
+ vdev_t *v = zio->io_vd;
|
||||
+ vdev_disk_t *vd = v->vdev_tsd;
|
||||
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
||||
+ int flags = 0;
|
||||
+
|
||||
+ /*
|
||||
+ * Accessing outside the block device is never allowed.
|
||||
+ */
|
||||
+ if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
|
||||
+ vdev_dbgmsg(zio->io_vd,
|
||||
+ "Illegal access %llu size %llu, device size %llu",
|
||||
+ (u_longlong_t)zio->io_offset,
|
||||
+ (u_longlong_t)zio->io_size,
|
||||
+ (u_longlong_t)i_size_read(bdev->bd_inode));
|
||||
+ return (SET_ERROR(EIO));
|
||||
+ }
|
||||
+
|
||||
+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
||||
+ v->vdev_failfast == B_TRUE) {
|
||||
+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
|
||||
+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Check alignment of the incoming ABD. If any part of it would require
|
||||
+ * submitting a page that is not aligned to the logical block size,
|
||||
+ * then we take a copy into a linear buffer and submit that instead.
|
||||
+ * This should be impossible on a 512b LBS, and fairly rare on 4K,
|
||||
+ * usually requiring abnormally-small data blocks (eg gang blocks)
|
||||
+ * mixed into the same ABD as larger ones (eg aggregated).
|
||||
+ */
|
||||
+ abd_t *abd = zio->io_abd;
|
||||
+ if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
|
||||
+ void *buf;
|
||||
+ if (zio->io_type == ZIO_TYPE_READ)
|
||||
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
|
||||
+ else
|
||||
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
|
||||
+
|
||||
+ /*
|
||||
+ * Wrap the copy in an abd_t, so we can use the same iterators
|
||||
+ * to count and fill the vbio later.
|
||||
+ */
|
||||
+ abd = abd_get_from_buf(buf, zio->io_size);
|
||||
+
|
||||
+ /*
|
||||
+ * False here would mean the borrowed copy has an invalid
|
||||
+ * alignment too, which would mean we've somehow been passed a
|
||||
+ * linear ABD with an interior page that has a non-zero offset
|
||||
+ * or a size not a multiple of PAGE_SIZE. This is not possible.
|
||||
+ * It would mean either zio_buf_alloc() or its underlying
|
||||
+ * allocators have done something extremely strange, or our
|
||||
+ * math in vdev_disk_check_pages() is wrong. In either case,
|
||||
+ * something is seriously wrong and it's not safe to continue.
|
||||
+ */
|
||||
+ VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
|
||||
+ }
|
||||
+
|
||||
+ /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
|
||||
+ int error = 0;
|
||||
+ vbio_t *vbio = vbio_alloc(zio, bdev);
|
||||
+ if (abd != zio->io_abd)
|
||||
+ vbio->vbio_abd = abd;
|
||||
+
|
||||
+ /* Fill it with pages */
|
||||
+ error = abd_iterate_page_func(abd, 0, zio->io_size,
|
||||
+ vdev_disk_fill_vbio_cb, vbio);
|
||||
+ if (error != 0) {
|
||||
+ vbio_free(vbio);
|
||||
+ return (error);
|
||||
+ }
|
||||
+
|
||||
+ vbio_submit(vbio, flags);
|
||||
+ return (0);
|
||||
+}
|
||||
+
|
||||
/* ========== */
|
||||
|
||||
/*
|
||||
- * This is the classic, battle-tested BIO submission code.
|
||||
+ * This is the classic, battle-tested BIO submission code. Until we're totally
|
||||
+ * sure that the new code is safe and correct in all cases, this will remain
|
||||
+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
|
||||
+ * load time.
|
||||
*
|
||||
* These functions have been renamed to vdev_classic_* to make it clear what
|
||||
* they belong to, but their implementations are unchanged.
|
||||
@@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
||||
(void) tsd;
|
||||
|
||||
if (vdev_disk_io_rw_fn == NULL)
|
||||
- vdev_disk_io_rw_fn = vdev_classic_physio;
|
||||
+ /* XXX make configurable */
|
||||
+ vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
|
||||
|
||||
return (0);
|
||||
}
|
||||
@@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
||||
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
|
||||
+
|
||||
+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
|
||||
+ "Maximum number of data segments to add to an IO request (min 4)");
|
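A note on the segment-sizing logic in vbio_add_page() above: each pass takes as much of the current page as still fits in the BIO, rounded down to the device's logical block size, and starts a new BIO when nothing block-aligned fits. The following standalone sketch reproduces only that arithmetic; the constants (an 8 KiB per-BIO budget, 4 KiB logical blocks, a 12.5 KiB payload) are made up for illustration, and the real code also starts a new BIO when bio_add_page() refuses a segment.

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	const unsigned int max_bytes = 8192;	/* per-BIO byte budget (illustrative) */
	const unsigned int lbs = 4096;		/* logical block size */
	const unsigned int lbs_mask = ~(lbs - 1); /* rounds down to a block multiple */
	unsigned int bio_size = 0;		/* bytes already in the current BIO */
	unsigned int size = 3 * lbs + 512;	/* payload left to place */

	while (size > 0) {
		/* Take what fits, rounded down to whole logical blocks. */
		unsigned int ssize = MIN(size, (max_bytes - bio_size) & lbs_mask);
		if (ssize == 0) {
			printf("BIO full at %u bytes, starting a new one\n", bio_size);
			bio_size = 0;
			continue;
		}
		bio_size += ssize;
		size -= ssize;
		printf("added %u bytes to this BIO, %u left overall\n", ssize, size);
	}
	return (0);
}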
104
debian/patches/0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Tue, 9 Jan 2024 13:28:57 +1100
|
||||
Subject: [PATCH] vdev_disk: add module parameter to select BIO submission
|
||||
method
|
||||
|
||||
This makes the submission method selectable at module load time via the
|
||||
`zfs_vdev_disk_classic` parameter, allowing this change to be backported
|
||||
to 2.2 safely, and disabled in favour of the "classic" submission method
|
||||
if new problems come up.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit df2169d141aadc0c2cc728c5c5261d6f5c2a27f7)
|
||||
---
|
||||
man/man4/zfs.4 | 16 ++++++++++++++++
|
||||
module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++--
|
||||
2 files changed, 45 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
|
||||
index b5679f2f0..6a628e7f3 100644
|
||||
--- a/man/man4/zfs.4
|
||||
+++ b/man/man4/zfs.4
|
||||
@@ -1352,6 +1352,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
|
||||
itself, it will be clamped.
|
||||
Setting it to zero will cause the kernel's ideal size to be used.
|
||||
This parameter only applies on Linux.
|
||||
+This parameter is ignored if
|
||||
+.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
|
||||
+.
|
||||
+.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
|
||||
+If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
|
||||
+and earlier.
|
||||
+This "classic" method has known issues with highly fragmented IO requests and
|
||||
+is slower on many workloads, but it has been in use for many years and is known
|
||||
+to be very stable.
|
||||
+If you set this parameter, please also open a bug report why you did so,
|
||||
+including the workload involved and any error messages.
|
||||
+.Pp
|
||||
+This parameter and the classic submission method will be removed once we have
|
||||
+total confidence in the new method.
|
||||
+.Pp
|
||||
+This parameter only applies on Linux, and can only be set at module load time.
|
||||
.
|
||||
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
|
||||
Time before expiring
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index 0ccb9ad96..a9110623a 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd)
|
||||
/* XXX: Implement me as a vnode rele for the device */
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * BIO submission method. See comment above about vdev_classic.
|
||||
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
|
||||
+ */
|
||||
+static uint_t zfs_vdev_disk_classic = 0; /* default new */
|
||||
+
|
||||
+/* Set submission function from module parameter */
|
||||
+static int
|
||||
+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
|
||||
+{
|
||||
+ int err = param_set_uint(buf, kp);
|
||||
+ if (err < 0)
|
||||
+ return (SET_ERROR(err));
|
||||
+
|
||||
+ vdev_disk_io_rw_fn =
|
||||
+ zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
|
||||
+
|
||||
+ printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
|
||||
+ zfs_vdev_disk_classic ? "classic" : "new");
|
||||
+
|
||||
+ return (0);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* At first use vdev use, set the submission function from the default value if
|
||||
* it hasn't been set already.
|
||||
@@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
||||
(void) tsd;
|
||||
|
||||
if (vdev_disk_io_rw_fn == NULL)
|
||||
- /* XXX make configurable */
|
||||
- vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw;
|
||||
+ vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
|
||||
+ vdev_classic_physio : vdev_disk_io_rw;
|
||||
|
||||
return (0);
|
||||
}
|
||||
@@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
|
||||
"Maximum number of data segments to add to an IO request (min 4)");
|
||||
+
|
||||
+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
|
||||
+ vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
|
||||
+ "Use classic BIO submission method");
|
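The moving part in this patch is small: a module-scope function pointer that is set once, either from the parameter callback or on first vdev_disk_init(), and every read/write ZIO then goes through whichever submission routine it points at. Below is a rough standalone sketch of that dispatch shape; the names disk_classic, classic_physio and new_io_rw are stand-ins for zfs_vdev_disk_classic, vdev_classic_physio and vdev_disk_io_rw, and in the module the selection is made at load time (for example via an `options zfs zfs_vdev_disk_classic=1` modprobe line), not in main().

#include <stdio.h>
#include <stddef.h>

/* Both submission routines share a signature, so a pointer can pick one. */
typedef int (*io_rw_fn_t)(const char *what);

static int
classic_physio(const char *what)
{
	printf("classic submission path: %s\n", what);
	return (0);
}

static int
new_io_rw(const char *what)
{
	printf("new submission path: %s\n", what);
	return (0);
}

static unsigned int disk_classic = 1;	/* stand-in for the module parameter */
static io_rw_fn_t io_rw_fn = NULL;	/* stand-in for vdev_disk_io_rw_fn */

int
main(void)
{
	/* Set the routine from the "parameter" if it hasn't been set yet. */
	if (io_rw_fn == NULL)
		io_rw_fn = disk_classic ? classic_physio : new_io_rw;

	/* Every IO now dispatches through the chosen routine. */
	return (io_rw_fn("example zio"));
}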
363
debian/patches/0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
vendored
Normal file
@@ -0,0 +1,363 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Wed, 21 Feb 2024 11:07:21 +1100
|
||||
Subject: [PATCH] vdev_disk: use bio_chain() to submit multiple BIOs
|
||||
|
||||
Simplifies our code a lot, so we don't have to wait for each BIO and
|
||||
reassemble them.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit 72fd834c47558cb10d847948d1a4615e894c77c3)
|
||||
---
|
||||
module/os/linux/zfs/vdev_disk.c | 231 +++++++++++---------------------
|
||||
1 file changed, 80 insertions(+), 151 deletions(-)
|
||||
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index a9110623a..36468fc21 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v)
|
||||
if (v->vdev_reopening || vd == NULL)
|
||||
return;
|
||||
|
||||
- if (vd->vd_bdh != NULL) {
|
||||
+ if (vd->vd_bdh != NULL)
|
||||
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
|
||||
zfs_vdev_holder);
|
||||
- }
|
||||
|
||||
rw_destroy(&vd->vd_lock);
|
||||
kmem_free(vd, sizeof (vdev_disk_t));
|
||||
@@ -663,9 +662,6 @@ typedef struct {
|
||||
|
||||
abd_t *vbio_abd; /* abd carrying borrowed linear buf */
|
||||
|
||||
- atomic_t vbio_ref; /* bio refcount */
|
||||
- int vbio_error; /* error from failed bio */
|
||||
-
|
||||
uint_t vbio_max_segs; /* max segs per bio */
|
||||
|
||||
uint_t vbio_max_bytes; /* max bytes per bio */
|
||||
@@ -674,43 +670,52 @@ typedef struct {
|
||||
uint64_t vbio_offset; /* start offset of next bio */
|
||||
|
||||
struct bio *vbio_bio; /* pointer to the current bio */
|
||||
- struct bio *vbio_bios; /* list of all bios */
|
||||
+ int vbio_flags; /* bio flags */
|
||||
} vbio_t;
|
||||
|
||||
static vbio_t *
|
||||
-vbio_alloc(zio_t *zio, struct block_device *bdev)
|
||||
+vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
|
||||
{
|
||||
vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
|
||||
|
||||
vbio->vbio_zio = zio;
|
||||
vbio->vbio_bdev = bdev;
|
||||
- atomic_set(&vbio->vbio_ref, 0);
|
||||
+ vbio->vbio_abd = NULL;
|
||||
vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
|
||||
vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
|
||||
vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
|
||||
vbio->vbio_offset = zio->io_offset;
|
||||
+ vbio->vbio_bio = NULL;
|
||||
+ vbio->vbio_flags = flags;
|
||||
|
||||
return (vbio);
|
||||
}
|
||||
|
||||
+BIO_END_IO_PROTO(vbio_completion, bio, error);
|
||||
+
|
||||
static int
|
||||
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
||||
{
|
||||
- struct bio *bio;
|
||||
+ struct bio *bio = vbio->vbio_bio;
|
||||
uint_t ssize;
|
||||
|
||||
while (size > 0) {
|
||||
- bio = vbio->vbio_bio;
|
||||
if (bio == NULL) {
|
||||
/* New BIO, allocate and set up */
|
||||
bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
|
||||
vbio->vbio_max_segs);
|
||||
- if (unlikely(bio == NULL))
|
||||
- return (SET_ERROR(ENOMEM));
|
||||
+ VERIFY(bio);
|
||||
+
|
||||
BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
|
||||
+ bio_set_op_attrs(bio,
|
||||
+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
||||
+ WRITE : READ, vbio->vbio_flags);
|
||||
|
||||
- bio->bi_next = vbio->vbio_bios;
|
||||
- vbio->vbio_bios = vbio->vbio_bio = bio;
|
||||
+ if (vbio->vbio_bio) {
|
||||
+ bio_chain(vbio->vbio_bio, bio);
|
||||
+ vdev_submit_bio(vbio->vbio_bio);
|
||||
+ }
|
||||
+ vbio->vbio_bio = bio;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
||||
vbio->vbio_offset += BIO_BI_SIZE(bio);
|
||||
|
||||
/* Signal new BIO allocation wanted */
|
||||
- vbio->vbio_bio = NULL;
|
||||
+ bio = NULL;
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error);
|
||||
-static void vbio_put(vbio_t *vbio);
|
||||
+/* Iterator callback to submit ABD pages to the vbio. */
|
||||
+static int
|
||||
+vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||
+{
|
||||
+ vbio_t *vbio = priv;
|
||||
+ return (vbio_add_page(vbio, page, len, off));
|
||||
+}
|
||||
|
||||
+/* Create some BIOs, fill them with data and submit them */
|
||||
static void
|
||||
-vbio_submit(vbio_t *vbio, int flags)
|
||||
+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
||||
{
|
||||
- ASSERT(vbio->vbio_bios);
|
||||
- struct bio *bio = vbio->vbio_bios;
|
||||
- vbio->vbio_bio = vbio->vbio_bios = NULL;
|
||||
-
|
||||
- /*
|
||||
- * We take a reference for each BIO as we submit it, plus one to
|
||||
- * protect us from BIOs completing before we're done submitting them
|
||||
- * all, causing vbio_put() to free vbio out from under us and/or the
|
||||
- * zio to be returned before all its IO has completed.
|
||||
- */
|
||||
- atomic_set(&vbio->vbio_ref, 1);
|
||||
+ ASSERT(vbio->vbio_bdev);
|
||||
|
||||
/*
|
||||
- * If we're submitting more than one BIO, inform the block layer so
|
||||
- * it can batch them if it wants.
|
||||
+ * We plug so we can submit the BIOs as we go and only unplug them when
|
||||
+ * they are fully created and submitted. This is important; if we don't
|
||||
+ * plug, then the kernel may start executing earlier BIOs while we're
|
||||
+ * still creating and executing later ones, and if the device goes
|
||||
+ * away while that's happening, older kernels can get confused and
|
||||
+ * trample memory.
|
||||
*/
|
||||
struct blk_plug plug;
|
||||
- boolean_t do_plug = (bio->bi_next != NULL);
|
||||
- if (do_plug)
|
||||
- blk_start_plug(&plug);
|
||||
+ blk_start_plug(&plug);
|
||||
|
||||
- /* Submit all the BIOs */
|
||||
- while (bio != NULL) {
|
||||
- atomic_inc(&vbio->vbio_ref);
|
||||
+ (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
|
||||
+ ASSERT(vbio->vbio_bio);
|
||||
|
||||
- struct bio *next = bio->bi_next;
|
||||
- bio->bi_next = NULL;
|
||||
+ vbio->vbio_bio->bi_end_io = vbio_completion;
|
||||
+ vbio->vbio_bio->bi_private = vbio;
|
||||
|
||||
- bio->bi_end_io = vdev_disk_io_rw_completion;
|
||||
- bio->bi_private = vbio;
|
||||
- bio_set_op_attrs(bio,
|
||||
- vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
||||
- WRITE : READ, flags);
|
||||
+ vdev_submit_bio(vbio->vbio_bio);
|
||||
|
||||
- vdev_submit_bio(bio);
|
||||
-
|
||||
- bio = next;
|
||||
- }
|
||||
-
|
||||
- /* Finish the batch */
|
||||
- if (do_plug)
|
||||
- blk_finish_plug(&plug);
|
||||
+ blk_finish_plug(&plug);
|
||||
|
||||
- /* Release the extra reference */
|
||||
- vbio_put(vbio);
|
||||
+ vbio->vbio_bio = NULL;
|
||||
+ vbio->vbio_bdev = NULL;
|
||||
}
|
||||
|
||||
-static void
|
||||
-vbio_return_abd(vbio_t *vbio)
|
||||
+/* IO completion callback */
|
||||
+BIO_END_IO_PROTO(vbio_completion, bio, error)
|
||||
{
|
||||
+ vbio_t *vbio = bio->bi_private;
|
||||
zio_t *zio = vbio->vbio_zio;
|
||||
- if (vbio->vbio_abd == NULL)
|
||||
- return;
|
||||
-
|
||||
- /*
|
||||
- * If we copied the ABD before issuing it, clean up and return the copy
|
||||
- * to the ADB, with changes if appropriate.
|
||||
- */
|
||||
- void *buf = abd_to_buf(vbio->vbio_abd);
|
||||
- abd_free(vbio->vbio_abd);
|
||||
- vbio->vbio_abd = NULL;
|
||||
-
|
||||
- if (zio->io_type == ZIO_TYPE_READ)
|
||||
- abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
||||
- else
|
||||
- abd_return_buf(zio->io_abd, buf, zio->io_size);
|
||||
-}
|
||||
|
||||
-static void
|
||||
-vbio_free(vbio_t *vbio)
|
||||
-{
|
||||
- VERIFY0(atomic_read(&vbio->vbio_ref));
|
||||
-
|
||||
- vbio_return_abd(vbio);
|
||||
+ ASSERT(zio);
|
||||
|
||||
- kmem_free(vbio, sizeof (vbio_t));
|
||||
-}
|
||||
+ /* Capture and log any errors */
|
||||
+#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
+ zio->io_error = BIO_END_IO_ERROR(bio);
|
||||
+#else
|
||||
+ zio->io_error = 0;
|
||||
+ if (error)
|
||||
+ zio->io_error = -(error);
|
||||
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
+ zio->io_error = EIO;
|
||||
+#endif
|
||||
+ ASSERT3U(zio->io_error, >=, 0);
|
||||
|
||||
-static void
|
||||
-vbio_put(vbio_t *vbio)
|
||||
-{
|
||||
- if (atomic_dec_return(&vbio->vbio_ref) > 0)
|
||||
- return;
|
||||
+ if (zio->io_error)
|
||||
+ vdev_disk_error(zio);
|
||||
|
||||
- /*
|
||||
- * This was the last reference, so the entire IO is completed. Clean
|
||||
- * up and submit it for processing.
|
||||
- */
|
||||
+ /* Return the BIO to the kernel */
|
||||
+ bio_put(bio);
|
||||
|
||||
/*
|
||||
- * Get any data buf back to the original ABD, if necessary. We do this
|
||||
- * now so we can get the ZIO into the pipeline as quickly as possible,
|
||||
- * and then do the remaining cleanup after.
|
||||
+ * If we copied the ABD before issuing it, clean up and return the copy
|
||||
+ * to the ADB, with changes if appropriate.
|
||||
*/
|
||||
- vbio_return_abd(vbio);
|
||||
+ if (vbio->vbio_abd != NULL) {
|
||||
+ void *buf = abd_to_buf(vbio->vbio_abd);
|
||||
+ abd_free(vbio->vbio_abd);
|
||||
+ vbio->vbio_abd = NULL;
|
||||
|
||||
- zio_t *zio = vbio->vbio_zio;
|
||||
+ if (zio->io_type == ZIO_TYPE_READ)
|
||||
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
||||
+ else
|
||||
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
|
||||
+ }
|
||||
|
||||
- /*
|
||||
- * Set the overall error. If multiple BIOs returned an error, only the
|
||||
- * first will be taken; the others are dropped (see
|
||||
- * vdev_disk_io_rw_completion()). Its pretty much impossible for
|
||||
- * multiple IOs to the same device to fail with different errors, so
|
||||
- * there's no real risk.
|
||||
- */
|
||||
- zio->io_error = vbio->vbio_error;
|
||||
- if (zio->io_error)
|
||||
- vdev_disk_error(zio);
|
||||
+ /* Final cleanup */
|
||||
+ kmem_free(vbio, sizeof (vbio_t));
|
||||
|
||||
/* All done, submit for processing */
|
||||
zio_delay_interrupt(zio);
|
||||
-
|
||||
- /* Finish cleanup */
|
||||
- vbio_free(vbio);
|
||||
-}
|
||||
-
|
||||
-BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
|
||||
-{
|
||||
- vbio_t *vbio = bio->bi_private;
|
||||
-
|
||||
- if (vbio->vbio_error == 0) {
|
||||
-#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
- vbio->vbio_error = BIO_END_IO_ERROR(bio);
|
||||
-#else
|
||||
- if (error)
|
||||
- vbio->vbio_error = -(error);
|
||||
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
- vbio->vbio_error = EIO;
|
||||
-#endif
|
||||
- }
|
||||
-
|
||||
- /*
|
||||
- * Destroy the BIO. This is safe to do; the vbio owns its data and the
|
||||
- * kernel won't touch it again after the completion function runs.
|
||||
- */
|
||||
- bio_put(bio);
|
||||
-
|
||||
- /* Drop this BIOs reference acquired by vbio_submit() */
|
||||
- vbio_put(vbio);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
-/* Iterator callback to submit ABD pages to the vbio. */
|
||||
-static int
|
||||
-vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||
-{
|
||||
- vbio_t *vbio = priv;
|
||||
- return (vbio_add_page(vbio, page, len, off));
|
||||
-}
|
||||
-
|
||||
static int
|
||||
vdev_disk_io_rw(zio_t *zio)
|
||||
{
|
||||
@@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio)
|
||||
}
|
||||
|
||||
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
|
||||
- int error = 0;
|
||||
- vbio_t *vbio = vbio_alloc(zio, bdev);
|
||||
+ vbio_t *vbio = vbio_alloc(zio, bdev, flags);
|
||||
if (abd != zio->io_abd)
|
||||
vbio->vbio_abd = abd;
|
||||
|
||||
- /* Fill it with pages */
|
||||
- error = abd_iterate_page_func(abd, 0, zio->io_size,
|
||||
- vdev_disk_fill_vbio_cb, vbio);
|
||||
- if (error != 0) {
|
||||
- vbio_free(vbio);
|
||||
- return (error);
|
||||
- }
|
||||
-
|
||||
- vbio_submit(vbio, flags);
|
||||
+ /* Fill it with data pages and submit it to the kernel */
|
||||
+ vbio_submit(vbio, abd, zio->io_size);
|
||||
return (0);
|
||||
}
|
||||
|
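What bio_chain() buys the code above is a completion rule: bio_chain(old, new) makes the completion of the already-submitted old BIO count toward the new one, so the single vbio_completion callback, attached only to the final BIO, cannot run until every earlier chained BIO has finished. The sketch below models just that counting rule in userspace; fake_bio is a stand-in, not struct bio, and the remaining counter only loosely mimics the kernel's __bi_remaining bookkeeping.

#include <stdio.h>
#include <stddef.h>

typedef struct fake_bio {
	struct fake_bio *parent;		/* set when chained */
	int remaining;				/* own completion + chained children */
	void (*end_io)(struct fake_bio *);	/* only the final bio carries one */
} fake_bio_t;

static void
fake_chain(fake_bio_t *bio, fake_bio_t *parent)
{
	bio->parent = parent;
	parent->remaining++;	/* parent now also waits for this bio */
}

static void
fake_endio(fake_bio_t *bio)
{
	if (--bio->remaining > 0)
		return;
	if (bio->end_io != NULL)
		bio->end_io(bio);
	if (bio->parent != NULL)
		fake_endio(bio->parent);	/* propagate completion up the chain */
}

static void
final_completion(fake_bio_t *bio)
{
	(void) bio;
	printf("all chained BIOs complete, run the one completion callback\n");
}

int
main(void)
{
	fake_bio_t a = { .remaining = 1 };
	fake_bio_t b = { .remaining = 1 };
	fake_bio_t c = { .remaining = 1, .end_io = final_completion };

	fake_chain(&a, &b);	/* a's completion counts toward b */
	fake_chain(&b, &c);	/* b's completion counts toward c */

	fake_endio(&c);		/* the final BIO finishing first is not enough */
	fake_endio(&a);
	fake_endio(&b);		/* ... only now does final_completion() run */
	return (0);
}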
96
debian/patches/0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Thu, 14 Mar 2024 10:57:30 +1100
|
||||
Subject: [PATCH] abd_iter_page: don't use compound heads on Linux <4.5
|
||||
|
||||
Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages
|
||||
in a compound page were refcounted separately. This means that using the
|
||||
head page without taking a reference to it could see it cleaned up later
|
||||
before we're finished with it. Specifically, bio_add_page() would take a
|
||||
reference, and drop its reference after the bio completion callback
|
||||
returns.
|
||||
|
||||
If the zio is executed immediately from the completion callback, this is
|
||||
usually ok, as any data is referenced through the tail page referenced
|
||||
by the ABD, and so becomes "live" that way. If there's a delay in zio
|
||||
execution (high load, error injection), then the head page can be freed,
|
||||
along with any dirty flags or other indicators that the underlying
|
||||
memory is used. Later, when the zio completes and that memory is
|
||||
accessed, it's either unmapped and an unhandled fault takes down the
|
||||
entire system, or it is mapped and we end up messing around in someone
|
||||
else's memory. Both of these are very bad.
|
||||
|
||||
The solution on these older kernels is to take a reference to the head
|
||||
page when we use it, and release it when we're done. There's not really
|
||||
a sensible way under our current structure to do this; the "best" would
|
||||
be to keep a list of head page references in the ABD, and release them
|
||||
when the ABD is freed.
|
||||
|
||||
Since this additional overhead is totally unnecessary on 4.5+, where
|
||||
head and tail pages share refcounts, I've opted to simply not use the
|
||||
compound head in ABD page iteration there. This is theoretically less
|
||||
efficient (though cleaning up head page references would add overhead),
|
||||
but it's safe, and we still get the other benefits of not mapping pages
|
||||
before adding them to a bio and not mis-splitting pages.
|
||||
|
||||
There doesn't appear to be an obvious symbol name or config option we
|
||||
can match on to discover this behaviour in configure (and the mm/page
|
||||
APIs have changed a lot since then anyway), so I've gone with a simple
|
||||
version check.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
Closes #15533
|
||||
Closes #15588
|
||||
(cherry picked from commit c6be6ce1755a3d9a3cbe70256cd8958ef83d8542)
|
||||
---
|
||||
module/os/linux/zfs/abd_os.c | 14 ++++++++++++++
|
||||
1 file changed, 14 insertions(+)
|
||||
|
||||
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
|
||||
index 3fe01c0b7..d3255dcbc 100644
|
||||
--- a/module/os/linux/zfs/abd_os.c
|
||||
+++ b/module/os/linux/zfs/abd_os.c
|
||||
@@ -62,6 +62,7 @@
|
||||
#include <linux/kmap_compat.h>
|
||||
#include <linux/mm_compat.h>
|
||||
#include <linux/scatterlist.h>
|
||||
+#include <linux/version.h>
|
||||
#endif
|
||||
|
||||
#ifdef _KERNEL
|
||||
@@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter)
|
||||
}
|
||||
ASSERT(page);
|
||||
|
||||
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
|
||||
if (PageTail(page)) {
|
||||
/*
|
||||
* This page is part of a "compound page", which is a group of
|
||||
@@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter)
|
||||
* To do this, we need to adjust the offset to be counted from
|
||||
* the head page. struct page for compound pages are stored
|
||||
* contiguously, so we can just adjust by a simple offset.
|
||||
+ *
|
||||
+ * Before kernel 4.5, compound page heads were refcounted
|
||||
+ * separately, such that moving back to the head page would
|
||||
+ * require us to take a reference to it and releasing it once
|
||||
+ * we're completely finished with it. In practice, that means
|
||||
+ * when our caller is done with the ABD, which we have no
|
||||
+ * insight into from here. Rather than contort this API to
|
||||
+ * track head page references on such ancient kernels, we just
|
||||
+ * compile this block out and use the tail pages directly. This
|
||||
+ * is slightly less efficient, but makes everything far
|
||||
+ * simpler.
|
||||
*/
|
||||
struct page *head = compound_head(page);
|
||||
doff += ((page - head) * PAGESIZE);
|
||||
page = head;
|
||||
}
|
||||
+#endif
|
||||
|
||||
/* final page and position within it */
|
||||
aiter->iter_page = page;
|
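The block that the new version check compiles out is simple descriptor arithmetic: page descriptors of a compound page sit contiguously, so the distance from the head page times the page size is folded into the offset and iteration continues from the head. A standalone illustration follows; fake_page stands in for struct page and the 4 KiB PAGESIZE is assumed.

#include <stdio.h>
#include <stddef.h>

#define	PAGESIZE	4096

/* Stand-in for struct page; only its position in the array matters here. */
struct fake_page {
	int flags;
};

int
main(void)
{
	/* An eight-page compound page: index 0 plays the head page. */
	struct fake_page pages[8] = { { 0 } };
	struct fake_page *head = &pages[0];

	/* The iterator landed on tail page 5, 100 bytes into that page. */
	struct fake_page *page = &pages[5];
	size_t doff = 100;

	/* Re-express the position relative to the head, then use the head. */
	doff += (size_t)(page - head) * PAGESIZE;
	page = head;
	(void) page;	/* iteration would continue from the head page */

	printf("offset from head page: %zu bytes\n", doff);	/* 5*4096 + 100 */
	return (0);
}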
90
debian/patches/0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
vendored
Normal file
@@ -0,0 +1,90 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Wed, 27 Mar 2024 13:11:12 +1100
|
||||
Subject: [PATCH] vdev_disk: default to classic submission for 2.2.x
|
||||
|
||||
We don't want to change to brand-new code in the middle of a stable
|
||||
series, but we want it available to test for people running into page
|
||||
splitting issues.
|
||||
|
||||
This commit makes zfs_vdev_disk_classic=1 the default, and updates the
|
||||
documentation to better explain what's going on.
|
||||
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
---
|
||||
man/man4/zfs.4 | 31 ++++++++++++++++++++++---------
|
||||
module/os/linux/zfs/vdev_disk.c | 8 +++++---
|
||||
2 files changed, 27 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
|
||||
index 6a628e7f3..a98ec519a 100644
|
||||
--- a/man/man4/zfs.4
|
||||
+++ b/man/man4/zfs.4
|
||||
@@ -1355,17 +1355,30 @@ This parameter only applies on Linux.
|
||||
This parameter is ignored if
|
||||
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
|
||||
.
|
||||
-.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
|
||||
-If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
|
||||
-and earlier.
|
||||
-This "classic" method has known issues with highly fragmented IO requests and
|
||||
-is slower on many workloads, but it has been in use for many years and is known
|
||||
-to be very stable.
|
||||
-If you set this parameter, please also open a bug report why you did so,
|
||||
+.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
|
||||
+Controls the method used to submit IO to the Linux block layer
|
||||
+(default
|
||||
+.Sy 1 "classic" Ns
|
||||
+)
|
||||
+.Pp
|
||||
+If set to 1, the "classic" method is used.
|
||||
+This is the method that has been in use since the earliest versions of
|
||||
+ZFS-on-Linux.
|
||||
+It has known issues with highly fragmented IO requests and is less efficient on
|
||||
+many workloads, but it is well known and well understood.
|
||||
+.Pp
|
||||
+If set to 0, the "new" method is used.
|
||||
+This method is available since 2.2.4 and should resolve all known issues and be
|
||||
+far more efficient, but has not had as much testing.
|
||||
+In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
|
||||
+.Pp
|
||||
+It is not recommended that you change it except on advice from the OpenZFS
|
||||
+developers.
|
||||
+If you do change it, please also open a bug report describing why you did so,
|
||||
including the workload involved and any error messages.
|
||||
.Pp
|
||||
-This parameter and the classic submission method will be removed once we have
|
||||
-total confidence in the new method.
|
||||
+This parameter and the "classic" submission method will be removed in a future
|
||||
+release of OpenZFS once we have total confidence in the new method.
|
||||
.Pp
|
||||
This parameter only applies on Linux, and can only be set at module load time.
|
||||
.
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index 36468fc21..e1c19a085 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -969,8 +969,10 @@ vdev_disk_io_rw(zio_t *zio)
|
||||
/*
|
||||
* This is the classic, battle-tested BIO submission code. Until we're totally
|
||||
* sure that the new code is safe and correct in all cases, this will remain
|
||||
- * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
|
||||
- * load time.
|
||||
+ * available.
|
||||
+ *
|
||||
+ * It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
|
||||
+ * enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
|
||||
*
|
||||
* These functions have been renamed to vdev_classic_* to make it clear what
|
||||
* they belong to, but their implementations are unchanged.
|
||||
@@ -1468,7 +1470,7 @@ vdev_disk_rele(vdev_t *vd)
|
||||
* BIO submission method. See comment above about vdev_classic.
|
||||
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
|
||||
*/
|
||||
-static uint_t zfs_vdev_disk_classic = 0; /* default new */
|
||||
+static uint_t zfs_vdev_disk_classic = 1; /* default classic */
|
||||
|
||||
/* Set submission function from module parameter */
|
||||
static int
|
104
debian/patches/0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Robert Evans <rrevans@gmail.com>
|
||||
Date: Mon, 25 Mar 2024 17:56:49 -0400
|
||||
Subject: [PATCH] Fix corruption caused by mmap flushing problems
|
||||
|
||||
1) Make mmap flushes synchronous. Linux may skip flushing dirty pages
|
||||
already in writeback unless data-integrity sync is requested.
|
||||
|
||||
2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be
|
||||
skipped due to DMU pushing back on TX assign.
|
||||
|
||||
3) Add missing mmap flush when doing block cloning.
|
||||
|
||||
4) While here, pass errors from putpage to writepage/writepages.
|
||||
|
||||
This change fixes corruption edge cases, but unfortunately adds
|
||||
synchronous ZIL flushes for dirty mmap pages to llseek and bclone
|
||||
operations. It may be possible to avoid these sync writes later
|
||||
but would need more tricky refactoring of the writeback code.
|
||||
|
||||
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||||
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||||
Signed-off-by: Robert Evans <evansr@google.com>
|
||||
Closes #15933
|
||||
Closes #16019
|
||||
---
|
||||
module/os/linux/zfs/zfs_vnops_os.c | 5 +----
|
||||
module/os/linux/zfs/zpl_file.c | 8 ++++----
|
||||
module/zfs/zfs_vnops.c | 6 +++++-
|
||||
3 files changed, 10 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
|
||||
index c06a75662..7c473bc7e 100644
|
||||
--- a/module/os/linux/zfs/zfs_vnops_os.c
|
||||
+++ b/module/os/linux/zfs/zfs_vnops_os.c
|
||||
@@ -3792,11 +3792,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
||||
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
||||
zfs_sa_upgrade_txholds(tx, zp);
|
||||
|
||||
- err = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
+ err = dmu_tx_assign(tx, TXG_WAIT);
|
||||
if (err != 0) {
|
||||
- if (err == ERESTART)
|
||||
- dmu_tx_wait(tx);
|
||||
-
|
||||
dmu_tx_abort(tx);
|
||||
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
|
||||
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
|
||||
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
|
||||
index 3caa0fc6c..9dec52215 100644
|
||||
--- a/module/os/linux/zfs/zpl_file.c
|
||||
+++ b/module/os/linux/zfs/zpl_file.c
|
||||
@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
|
||||
{
|
||||
boolean_t *for_sync = data;
|
||||
fstrans_cookie_t cookie;
|
||||
+ int ret;
|
||||
|
||||
ASSERT(PageLocked(pp));
|
||||
ASSERT(!PageWriteback(pp));
|
||||
|
||||
cookie = spl_fstrans_mark();
|
||||
- (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
||||
+ ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
||||
spl_fstrans_unmark(cookie);
|
||||
|
||||
- return (0);
|
||||
+ return (ret);
|
||||
}
|
||||
|
||||
#ifdef HAVE_WRITEPAGE_T_FOLIO
|
||||
static int
|
||||
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
|
||||
{
|
||||
- (void) zpl_putpage(&pp->page, wbc, data);
|
||||
- return (0);
|
||||
+ return (zpl_putpage(&pp->page, wbc, data));
|
||||
}
|
||||
#endif
|
||||
|
||||
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
|
||||
index 2b37834d5..7020f88ec 100644
|
||||
--- a/module/zfs/zfs_vnops.c
|
||||
+++ b/module/zfs/zfs_vnops.c
|
||||
@@ -130,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
|
||||
|
||||
/* Flush any mmap()'d data to disk */
|
||||
if (zn_has_cached_data(zp, 0, file_sz - 1))
|
||||
- zn_flush_cached_data(zp, B_FALSE);
|
||||
+ zn_flush_cached_data(zp, B_TRUE);
|
||||
|
||||
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
|
||||
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
|
||||
@@ -1193,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
||||
}
|
||||
}
|
||||
|
||||
+ /* Flush any mmap()'d data to disk */
|
||||
+ if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
|
||||
+ zn_flush_cached_data(inzp, B_TRUE);
|
||||
+
|
||||
/*
|
||||
* Maintain predictable lock order.
|
||||
*/
|
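On the "data-integrity sync" wording in the commit message: from userspace the equivalent request is msync() with MS_SYNC on an mmap()'d range, which writes the dirty pages out and waits for them instead of skipping pages already queued for writeback. The sketch below only illustrates that kind of dirty-mmap state; it is not how the patch flushes (the patch uses the in-kernel zn_flush_cached_data(zp, B_TRUE) call), and the scratch file path is made up.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tmp/mmap-flush-demo";	/* illustrative scratch file */
	int fd = open(path, O_RDWR | O_CREAT, 0600);
	if (fd < 0 || ftruncate(fd, 4096) != 0) {
		perror("setup");
		return (1);
	}

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return (1);
	}

	memcpy(p, "dirty page", 10);	/* dirty the mapped page */

	/* MS_SYNC asks for data-integrity writeback and waits for it. */
	if (msync(p, 4096, MS_SYNC) != 0)
		perror("msync");

	munmap(p, 4096);
	close(fd);
	return (0);
}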
57
debian/patches/0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Rob Norris <rob.norris@klarasystems.com>
|
||||
Date: Tue, 2 Apr 2024 15:14:54 +1100
|
||||
Subject: [PATCH] vdev_disk: don't touch vbio after its handed off to the
|
||||
kernel
|
||||
|
||||
After IO is unplugged, it may complete immediately and vbio_completion
|
||||
be called in interrupt context. That may interrupt or deschedule our
|
||||
task. If it's the last bio, the vbio will be freed. Then, we get
|
||||
rescheduled, and try to write to freed memory through vbio->.
|
||||
|
||||
This patch just removes the cleanup, and the corresponding assert.
|
||||
These were leftovers from a previous iteration of vbio_submit() and were
|
||||
always "belt and suspenders" ops anyway, never strictly required.
|
||||
|
||||
Reported-by: Rich Ercolani <rincebrain@gmail.com>
|
||||
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||||
Sponsored-by: Klara, Inc.
|
||||
Sponsored-by: Wasabi Technology, Inc.
|
||||
(cherry picked from commit 34f662ad22206af6852020fd923ceccd836a855f)
|
||||
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
|
||||
---
|
||||
module/os/linux/zfs/vdev_disk.c | 11 ++++++-----
|
||||
1 file changed, 6 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
|
||||
index e1c19a085..62c7aa14f 100644
|
||||
--- a/module/os/linux/zfs/vdev_disk.c
|
||||
+++ b/module/os/linux/zfs/vdev_disk.c
|
||||
@@ -758,8 +758,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||
static void
|
||||
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
||||
{
|
||||
- ASSERT(vbio->vbio_bdev);
|
||||
-
|
||||
/*
|
||||
* We plug so we can submit the BIOs as we go and only unplug them when
|
||||
* they are fully created and submitted. This is important; if we don't
|
||||
@@ -777,12 +775,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
||||
vbio->vbio_bio->bi_end_io = vbio_completion;
|
||||
vbio->vbio_bio->bi_private = vbio;
|
||||
|
||||
+ /*
|
||||
+ * Once submitted, vbio_bio now owns vbio (through bi_private) and we
|
||||
+ * can't touch it again. The bio may complete and vbio_completion() be
|
||||
+ * called and free the vbio before this task is run again, so we must
|
||||
+ * consider it invalid from this point.
|
||||
+ */
|
||||
vdev_submit_bio(vbio->vbio_bio);
|
||||
|
||||
blk_finish_plug(&plug);
|
||||
-
|
||||
- vbio->vbio_bio = NULL;
|
||||
- vbio->vbio_bdev = NULL;
|
||||
}
|
||||
|
||||
/* IO completion callback */
|
13
debian/patches/series
vendored
@@ -10,3 +10,16 @@
|
||||
0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
|
||||
0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
|
||||
0012-udev-correctly-handle-partition-16-and-later.patch
|
||||
0013-Linux-6.8-compat-use-splice_copy_file_range-for-fall.patch
|
||||
0014-linux-5.4-compat-page_size.patch
|
||||
0015-abd-add-page-iterator.patch
|
||||
0016-vdev_disk-rename-existing-functions-to-vdev_classic_.patch
|
||||
0017-vdev_disk-reorganise-vdev_disk_io_start.patch
|
||||
0018-vdev_disk-make-read-write-IO-function-configurable.patch
|
||||
0019-vdev_disk-rewrite-BIO-filling-machinery-to-avoid-spl.patch
|
||||
0020-vdev_disk-add-module-parameter-to-select-BIO-submiss.patch
|
||||
0021-vdev_disk-use-bio_chain-to-submit-multiple-BIOs.patch
|
||||
0022-abd_iter_page-don-t-use-compound-heads-on-Linux-4.5.patch
|
||||
0023-vdev_disk-default-to-classic-submission-for-2.2.x.patch
|
||||
0024-Fix-corruption-caused-by-mmap-flushing-problems.patch
|
||||
0025-vdev_disk-don-t-touch-vbio-after-its-handed-off-to-t.patch
|
||||
|