From 390b448726c580999dd337be7a40b0e95cf1d50b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 11 Dec 2023 16:05:54 +1100 Subject: [PATCH] abd: add page iterator The regular ABD iterators yield data buffers, so they have to map and unmap pages into kernel memory. If the caller only wants to count chunks, or can use page pointers directly, then the map/unmap is just unnecessary overhead. This adds adb_iterate_page_func, which yields unmapped struct page instead. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- include/sys/abd.h | 7 +++ include/sys/abd_impl.h | 26 ++++++++- module/os/freebsd/zfs/abd_os.c | 4 +- module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++--- module/zfs/abd.c | 42 +++++++++++++ 5 files changed, 169 insertions(+), 14 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index b48dc3642..3a500e2c9 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 40546d4af..f88ea25e2 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 58a37df62..3b812271f 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 24390fbbf..dae128012 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ #include #ifdef _KERNEL #include +#include #include #endif @@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -1009,6 +1014,88 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) +/* + * Yield the next page struct and data offset and size within it, without + * mapping it into the address space. + */ +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + + /* total data remaining in abd from this position */ + dsize = aiter->iter_abd->abd_size - aiter->iter_offset; + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = sg_page(aiter->iter_sg); + + /* position within page */ + doff = aiter->iter_offset; + + /* remaining data in scatterlist */ + dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + } + ASSERT(page); + + if (PageTail(page)) { + /* + * This page is part of a "compound page", which is a group of + * pages that can be referenced from a single struct page *. + * Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the + * __GFP_COMP flag, which we get from scatter ABDs and SPL + * vmalloc slabs (ie >16K allocations). So a great many of the + * IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGE_SIZE pages, and can be + * safely used as-is. However, the head page has length + * covering itself and all the tail pages. If this ABD chunk + * spans multiple pages, then we can use the head page and a + * >PAGE_SIZE length, which is far more efficient. + * + * To do this, we need to adjust the offset to be counted from + * the head page. struct page for compound pages are stored + * contiguously, so we can just adjust by a simple offset. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } + + /* final page and position within it */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + + /* amount of data in the chunk, up to the end of the page */ + aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); +} + /* * bio_nr_pages for ABD. * @off is the offset in @abd @@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 0a2411a2d..2c0cda25d 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; };