335 lines
9.8 KiB
Diff
335 lines
9.8 KiB
Diff
![]() |
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||
|
From: Rob Norris <rob.norris@klarasystems.com>
|
||
|
Date: Mon, 11 Dec 2023 16:05:54 +1100
|
||
|
Subject: [PATCH] abd: add page iterator
|
||
|
|
||
|
The regular ABD iterators yield data buffers, so they have to map and
|
||
|
unmap pages into kernel memory. If the caller only wants to count
|
||
|
chunks, or can use page pointers directly, then the map/unmap is just
|
||
|
unnecessary overhead.
|
||
|
|
||
|
This adds abd_iterate_page_func, which yields unmapped struct page
|
||
|
instead.
|
||
|
|
||
|
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
|
||
|
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
|
||
|
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
|
||
|
Sponsored-by: Klara, Inc.
|
||
|
Sponsored-by: Wasabi Technology, Inc.
|
||
|
Closes #15533
|
||
|
Closes #15588
|
||
|
(cherry picked from commit 390b448726c580999dd337be7a40b0e95cf1d50b)
|
||
|
---
|
||
|
include/sys/abd.h | 7 +++
|
||
|
include/sys/abd_impl.h | 26 ++++++++-
|
||
|
module/os/freebsd/zfs/abd_os.c | 4 +-
|
||
|
module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++---
|
||
|
module/zfs/abd.c | 42 +++++++++++++
|
||
|
5 files changed, 169 insertions(+), 14 deletions(-)
|
||
|
|
||
|
diff --git a/include/sys/abd.h b/include/sys/abd.h
|
||
|
index 750f9986c..8a2df0bca 100644
|
||
|
--- a/include/sys/abd.h
|
||
|
+++ b/include/sys/abd.h
|
||
|
@@ -79,6 +79,9 @@ typedef struct abd {
|
||
|
|
||
|
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
|
||
|
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
|
||
|
+#if defined(__linux__) && defined(_KERNEL)
|
||
|
+typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
|
||
|
+#endif
|
||
|
|
||
|
extern int zfs_abd_scatter_enabled;
|
||
|
|
||
|
@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
|
||
|
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
|
||
|
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
|
||
|
abd_iter_func2_t *, void *);
|
||
|
+#if defined(__linux__) && defined(_KERNEL)
|
||
|
+int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
|
||
|
+ void *);
|
||
|
+#endif
|
||
|
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
|
||
|
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
|
||
|
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
|
||
|
diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h
|
||
|
index 40546d4af..f88ea25e2 100644
|
||
|
--- a/include/sys/abd_impl.h
|
||
|
+++ b/include/sys/abd_impl.h
|
||
|
@@ -21,6 +21,7 @@
|
||
|
/*
|
||
|
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||
|
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
|
||
|
+ * Copyright (c) 2023, 2024, Klara Inc.
|
||
|
*/
|
||
|
|
||
|
#ifndef _ABD_IMPL_H
|
||
|
@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
|
||
|
ABDSTAT_DECR /* Decrease abdstat values */
|
||
|
} abd_stats_op_t;
|
||
|
|
||
|
-struct scatterlist; /* forward declaration */
|
||
|
+/* forward declarations */
|
||
|
+struct scatterlist;
|
||
|
+struct page;
|
||
|
|
||
|
struct abd_iter {
|
||
|
/* public interface */
|
||
|
- void *iter_mapaddr; /* addr corresponding to iter_pos */
|
||
|
- size_t iter_mapsize; /* length of data valid at mapaddr */
|
||
|
+ union {
|
||
|
+ /* for abd_iter_map()/abd_iter_unmap() */
|
||
|
+ struct {
|
||
|
+ /* addr corresponding to iter_pos */
|
||
|
+ void *iter_mapaddr;
|
||
|
+ /* length of data valid at mapaddr */
|
||
|
+ size_t iter_mapsize;
|
||
|
+ };
|
||
|
+ /* for abd_iter_page() */
|
||
|
+ struct {
|
||
|
+ /* current page */
|
||
|
+ struct page *iter_page;
|
||
|
+ /* offset of data in page */
|
||
|
+ size_t iter_page_doff;
|
||
|
+ /* size of data in page */
|
||
|
+ size_t iter_page_dsize;
|
||
|
+ };
|
||
|
+ };
|
||
|
|
||
|
/* private */
|
||
|
abd_t *iter_abd; /* ABD being iterated through */
|
||
|
@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
|
||
|
void abd_iter_advance(struct abd_iter *, size_t);
|
||
|
void abd_iter_map(struct abd_iter *);
|
||
|
void abd_iter_unmap(struct abd_iter *);
|
||
|
+void abd_iter_page(struct abd_iter *);
|
||
|
|
||
|
/*
|
||
|
* Helper macros
|
||
|
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
|
||
|
index 58a37df62..3b812271f 100644
|
||
|
--- a/module/os/freebsd/zfs/abd_os.c
|
||
|
+++ b/module/os/freebsd/zfs/abd_os.c
|
||
|
@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||
|
{
|
||
|
ASSERT(!abd_is_gang(abd));
|
||
|
abd_verify(abd);
|
||
|
+ memset(aiter, 0, sizeof (struct abd_iter));
|
||
|
aiter->iter_abd = abd;
|
||
|
- aiter->iter_pos = 0;
|
||
|
- aiter->iter_mapaddr = NULL;
|
||
|
- aiter->iter_mapsize = 0;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
|
||
|
index 24390fbbf..dae128012 100644
|
||
|
--- a/module/os/linux/zfs/abd_os.c
|
||
|
+++ b/module/os/linux/zfs/abd_os.c
|
||
|
@@ -21,6 +21,7 @@
|
||
|
/*
|
||
|
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||
|
* Copyright (c) 2019 by Delphix. All rights reserved.
|
||
|
+ * Copyright (c) 2023, 2024, Klara Inc.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
@@ -59,6 +60,7 @@
|
||
|
#include <sys/zfs_znode.h>
|
||
|
#ifdef _KERNEL
|
||
|
#include <linux/kmap_compat.h>
|
||
|
+#include <linux/mm_compat.h>
|
||
|
#include <linux/scatterlist.h>
|
||
|
#endif
|
||
|
|
||
|
@@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||
|
{
|
||
|
ASSERT(!abd_is_gang(abd));
|
||
|
abd_verify(abd);
|
||
|
+ memset(aiter, 0, sizeof (struct abd_iter));
|
||
|
aiter->iter_abd = abd;
|
||
|
- aiter->iter_mapaddr = NULL;
|
||
|
- aiter->iter_mapsize = 0;
|
||
|
- aiter->iter_pos = 0;
|
||
|
- if (abd_is_linear(abd)) {
|
||
|
- aiter->iter_offset = 0;
|
||
|
- aiter->iter_sg = NULL;
|
||
|
- } else {
|
||
|
+ if (!abd_is_linear(abd)) {
|
||
|
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
|
||
|
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
|
||
|
}
|
||
|
@@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||
|
boolean_t
|
||
|
abd_iter_at_end(struct abd_iter *aiter)
|
||
|
{
|
||
|
+ ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
|
||
|
return (aiter->iter_pos == aiter->iter_abd->abd_size);
|
||
|
}
|
||
|
|
||
|
@@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
|
||
|
void
|
||
|
abd_iter_advance(struct abd_iter *aiter, size_t amount)
|
||
|
{
|
||
|
+ /*
|
||
|
+ * Ensure that last chunk is not in use. abd_iterate_*() must clear
|
||
|
+ * this state (directly or via abd_iter_unmap()) before advancing.
|
||
|
+ */
|
||
|
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
|
||
|
ASSERT0(aiter->iter_mapsize);
|
||
|
+ ASSERT3P(aiter->iter_page, ==, NULL);
|
||
|
+ ASSERT0(aiter->iter_page_doff);
|
||
|
+ ASSERT0(aiter->iter_page_dsize);
|
||
|
|
||
|
/* There's nothing left to advance to, so do nothing */
|
||
|
if (abd_iter_at_end(aiter))
|
||
|
@@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
|
||
|
}
|
||
|
|
||
|
#if defined(_KERNEL)
|
||
|
+/*
|
||
|
+ * Yield the next page struct and data offset and size within it, without
|
||
|
+ * mapping it into the address space.
|
||
|
+ */
|
||
|
+void
|
||
|
+abd_iter_page(struct abd_iter *aiter)
|
||
|
+{
|
||
|
+ if (abd_iter_at_end(aiter)) {
|
||
|
+ aiter->iter_page = NULL;
|
||
|
+ aiter->iter_page_doff = 0;
|
||
|
+ aiter->iter_page_dsize = 0;
|
||
|
+ return;
|
||
|
+ }
|
||
|
+
|
||
|
+ struct page *page;
|
||
|
+ size_t doff, dsize;
|
||
|
+
|
||
|
+ if (abd_is_linear(aiter->iter_abd)) {
|
||
|
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
|
||
|
+
|
||
|
+ /* memory address at iter_pos */
|
||
|
+ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
|
||
|
+
|
||
|
+ /* struct page for address */
|
||
|
+ page = is_vmalloc_addr(paddr) ?
|
||
|
+ vmalloc_to_page(paddr) : virt_to_page(paddr);
|
||
|
+
|
||
|
+ /* offset of address within the page */
|
||
|
+ doff = offset_in_page(paddr);
|
||
|
+
|
||
|
+ /* total data remaining in abd from this position */
|
||
|
+ dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
|
||
|
+ } else {
|
||
|
+ ASSERT(!abd_is_gang(aiter->iter_abd));
|
||
|
+
|
||
|
+ /* current scatter page */
|
||
|
+ page = sg_page(aiter->iter_sg);
|
||
|
+
|
||
|
+ /* position within page */
|
||
|
+ doff = aiter->iter_offset;
|
||
|
+
|
||
|
+ /* remaining data in scatterlist */
|
||
|
+ dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
|
||
|
+ aiter->iter_abd->abd_size - aiter->iter_pos);
|
||
|
+ }
|
||
|
+ ASSERT(page);
|
||
|
+
|
||
|
+ if (PageTail(page)) {
|
||
|
+ /*
|
||
|
+ * This page is part of a "compound page", which is a group of
|
||
|
+ * pages that can be referenced from a single struct page *.
|
||
|
+ * It's organised as a "head" page, followed by a series of
|
||
|
+ * "tail" pages.
|
||
|
+ *
|
||
|
+ * In OpenZFS, compound pages are allocated using the
|
||
|
+ * __GFP_COMP flag, which we get from scatter ABDs and SPL
|
||
|
+ * vmalloc slabs (ie >16K allocations). So a great many of the
|
||
|
+ * IO buffers we get are going to be of this type.
|
||
|
+ *
|
||
|
+ * The tail pages are just regular PAGE_SIZE pages, and can be
|
||
|
+ * safely used as-is. However, the head page has length
|
||
|
+ * covering itself and all the tail pages. If this ABD chunk
|
||
|
+ * spans multiple pages, then we can use the head page and a
|
||
|
+ * >PAGE_SIZE length, which is far more efficient.
|
||
|
+ *
|
||
|
+ * To do this, we need to adjust the offset to be counted from
|
||
|
+ * the head page. struct page for compound pages are stored
|
||
|
+ * contiguously, so we can just adjust by a simple offset.
|
||
|
+ */
|
||
|
+ struct page *head = compound_head(page);
|
||
|
+ doff += ((page - head) * PAGESIZE);
|
||
|
+ page = head;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* final page and position within it */
|
||
|
+ aiter->iter_page = page;
|
||
|
+ aiter->iter_page_doff = doff;
|
||
|
+
|
||
|
+ /* amount of data in the chunk, up to the end of the page */
|
||
|
+ aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
||
|
+}
|
||
|
+
|
||
|
/*
|
||
|
* bio_nr_pages for ABD.
|
||
|
* @off is the offset in @abd
|
||
|
@@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
|
||
|
module_param(zfs_abd_scatter_max_order, uint, 0644);
|
||
|
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
|
||
|
"Maximum order allocation used for a scatter ABD.");
|
||
|
-#endif
|
||
|
+
|
||
|
+#endif /* _KERNEL */
|
||
|
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
|
||
|
index d982f201c..3388e2357 100644
|
||
|
--- a/module/zfs/abd.c
|
||
|
+++ b/module/zfs/abd.c
|
||
|
@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
|
||
|
return (ret);
|
||
|
}
|
||
|
|
||
|
+#if defined(__linux__) && defined(_KERNEL)
|
||
|
+int
|
||
|
+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
|
||
|
+ abd_iter_page_func_t *func, void *private)
|
||
|
+{
|
||
|
+ struct abd_iter aiter;
|
||
|
+ int ret = 0;
|
||
|
+
|
||
|
+ if (size == 0)
|
||
|
+ return (0);
|
||
|
+
|
||
|
+ abd_verify(abd);
|
||
|
+ ASSERT3U(off + size, <=, abd->abd_size);
|
||
|
+
|
||
|
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
|
||
|
+
|
||
|
+ while (size > 0) {
|
||
|
+ IMPLY(abd_is_gang(abd), c_abd != NULL);
|
||
|
+
|
||
|
+ abd_iter_page(&aiter);
|
||
|
+
|
||
|
+ size_t len = MIN(aiter.iter_page_dsize, size);
|
||
|
+ ASSERT3U(len, >, 0);
|
||
|
+
|
||
|
+ ret = func(aiter.iter_page, aiter.iter_page_doff,
|
||
|
+ len, private);
|
||
|
+
|
||
|
+ aiter.iter_page = NULL;
|
||
|
+ aiter.iter_page_doff = 0;
|
||
|
+ aiter.iter_page_dsize = 0;
|
||
|
+
|
||
|
+ if (ret != 0)
|
||
|
+ break;
|
||
|
+
|
||
|
+ size -= len;
|
||
|
+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
|
||
|
+ }
|
||
|
+
|
||
|
+ return (ret);
|
||
|
+}
|
||
|
+#endif
|
||
|
+
|
||
|
struct buf_arg {
|
||
|
void *arg_buf;
|
||
|
};
|