diff --git a/include/sys/abd.h b/include/sys/abd.h
index 3d9fdbf10..b781be4da 100644
--- a/include/sys/abd.h
+++ b/include/sys/abd.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
  */
 
 #ifndef	_ABD_H
@@ -44,7 +44,8 @@ typedef enum abd_flags {
 	ABD_FLAG_OWNER		= 1 << 1,	/* does it own its data buffers? */
 	ABD_FLAG_META		= 1 << 2,	/* does this represent FS metadata? */
 	ABD_FLAG_MULTI_ZONE	= 1 << 3,	/* pages split over memory zones */
-	ABD_FLAG_MULTI_CHUNK	= 1 << 4	/* pages split over multiple chunks */
+	ABD_FLAG_MULTI_CHUNK	= 1 << 4,	/* pages split over multiple chunks */
+	ABD_FLAG_LINEAR_PAGE	= 1 << 5,	/* linear but allocd from page */
 } abd_flags_t;
 
 typedef struct abd {
@@ -60,6 +61,7 @@ typedef struct abd {
 		} abd_scatter;
 		struct abd_linear {
 			void		*abd_buf;
+			struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
 		} abd_linear;
 	} abd_u;
 } abd_t;
@@ -75,6 +77,13 @@ abd_is_linear(abd_t *abd)
 	return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
 }
 
+static inline boolean_t
+abd_is_linear_page(abd_t *abd)
+{
+	return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ?
+	    B_TRUE : B_FALSE);
+}
+
 /*
  * Allocations and deallocations
  */
diff --git a/module/zfs/abd.c b/module/zfs/abd.c
index 32b2c842c..8b2514404 100644
--- a/module/zfs/abd.c
+++ b/module/zfs/abd.c
@@ -72,17 +72,19 @@
  * (2) Fragmentation is less of an issue since when we are at the limit of
  *     allocatable space, we won't have to search around for a long free
  *     hole in the VA space for large ARC allocations. Each chunk is mapped in
- *     individually, so even if we weren't using segkpm (see next point) we
+ *     individually, so even if we are using HIGHMEM (see next point) we
  *     wouldn't need to worry about finding a contiguous address range.
  *
- * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
- *     on each ABD access. (If segkpm isn't available then we use all linear
- *     ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ *     mapped into the kernel's address space, so we also avoid the map /
+ *     unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space.  See abd_alloc_pages() for details.
  *
  * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
- * available, which is the case on all 32-bit systems and any 64-bit systems
- * where kpm_enable is turned off.
+ * B_FALSE.
  *
  * In addition to directly allocating a linear or scattered ABD, it is also
  * possible to create an ABD by requesting the "sub-ABD" starting at an offset
@@ -249,18 +251,6 @@ abd_chunkcnt_for_bytes(size_t size)
 #define	__GFP_RECLAIM		__GFP_WAIT
 #endif
 
-static unsigned long
-abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
-{
-	struct page *page;
-
-	page = alloc_pages_node(nid, gfp, order);
-	if (!page)
-		return (0);
-
-	return ((unsigned long) page_address(page));
-}
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is
@@ -283,19 +273,18 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	size_t remaining_size;
 	int nid = NUMA_NO_NODE;
 	int alloc_pages = 0;
-	int order;
 
 	INIT_LIST_HEAD(&pages);
 
 	while (alloc_pages < nr_pages) {
-		unsigned long paddr;
 		unsigned chunk_pages;
+		int order;
 
 		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 		chunk_pages = (1U << order);
 
-		paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
-		if (paddr == 0) {
+		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+		if (page == NULL) {
 			if (order == 0) {
 				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 				schedule_timeout_interruptible(1);
@@ -305,7 +294,6 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			continue;
 		}
 
-		page = virt_to_page(paddr);
 		list_add_tail(&page->lru, &pages);
 
 		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
@@ -336,7 +324,41 @@ abd_alloc_pages(abd_t *abd, size_t size)
 		list_del(&page->lru);
 	}
 
-	if (chunks > 1) {
+	/*
+	 * These conditions ensure that a possible transformation to a linear
+	 * ABD would be valid.
+	 */
+	ASSERT(!PageHighMem(sg_page(table.sgl)));
+	ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+	if (table.nents == 1) {
+		/*
+		 * Since there is only one entry, this ABD can be represented
+		 * as a linear buffer.  All single-page (4K) ABD's can be
+		 * represented this way.  Some multi-page ABD's can also be
+		 * represented this way, if we were able to allocate a single
+		 * "chunk" (higher-order "page" which represents a power-of-2
+		 * series of physically-contiguous pages).  This is often the
+		 * case for 2-page (8K) ABD's.
+		 *
+		 * Representing a single-entry scatter ABD as a linear ABD
+		 * has the performance advantage of avoiding the copy (and
+		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+		 * A performance increase of around 5% has been observed for
+		 * ARC-cached reads (of small blocks which can take advantage
+		 * of this).
+		 *
+		 * Note that this optimization is only possible because the
+		 * pages are always mapped into the kernel's address space.
+		 * This is not the case for highmem pages, so the
+		 * optimization can not be made there.
+		 */
+		abd->abd_flags |= ABD_FLAG_LINEAR;
+		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+		abd->abd_u.abd_linear.abd_sgl = table.sgl;
+		abd->abd_u.abd_linear.abd_buf =
+		    page_address(sg_page(table.sgl));
+	} else if (table.nents > 1) {
 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
@@ -344,10 +366,10 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 		}
-	}
 
-	ABD_SCATTER(abd).abd_sgl = table.sgl;
-	ABD_SCATTER(abd).abd_nents = table.nents;
+		ABD_SCATTER(abd).abd_sgl = table.sgl;
+		ABD_SCATTER(abd).abd_nents = table.nents;
+	}
 }
 #else
 /*
@@ -427,10 +449,6 @@ abd_free_pages(abd_t *abd)
 
 struct page;
 
-#define	kpm_enable			1
-#define	abd_alloc_chunk(o) \
-	((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
-#define	abd_free_chunk(chunk, o)	umem_free(chunk, PAGESIZE << (o))
 #define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
 #define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
 #define	local_irq_save(flags)		do { (void)(flags); } while (0)
@@ -491,7 +509,7 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
 
 	abd_for_each_sg(abd, sg, nr_pages, i) {
-		struct page *p = abd_alloc_chunk(0);
+		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
 		sg_set_page(sg, p, PAGESIZE, 0);
 	}
 	ABD_SCATTER(abd).abd_nents = nr_pages;
@@ -502,12 +520,11 @@ abd_free_pages(abd_t *abd)
 {
 	int i, n = ABD_SCATTER(abd).abd_nents;
 	struct scatterlist *sg;
-	int j;
 
 	abd_for_each_sg(abd, sg, n, i) {
-		for (j = 0; j < sg->length; j += PAGESIZE) {
-			struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
-			abd_free_chunk(p, 0);
+		for (int j = 0; j < sg->length; j += PAGESIZE) {
+			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+			umem_free(p, PAGESIZE);
 		}
 	}
 
@@ -560,7 +577,7 @@ abd_verify(abd_t *abd)
 	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
 	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
-	    ABD_FLAG_MULTI_CHUNK));
+	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
 	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
 	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
 	if (abd_is_linear(abd)) {
@@ -613,6 +630,7 @@ abd_alloc(size_t size, boolean_t is_metadata)
 	abd_t *abd = abd_alloc_struct();
 
 	abd->abd_flags = ABD_FLAG_OWNER;
+	abd->abd_u.abd_scatter.abd_offset = 0;
 	abd_alloc_pages(abd, size);
 
 	if (is_metadata) {
@@ -622,8 +640,6 @@
 	abd->abd_parent = NULL;
 	zfs_refcount_create(&abd->abd_children);
 
-	abd->abd_u.abd_scatter.abd_offset = 0;
-
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
 	ABDSTAT_INCR(abdstat_scatter_data_size, size);
 	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
@@ -681,6 +697,17 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
 static void
 abd_free_linear(abd_t *abd)
 {
+	if (abd_is_linear_page(abd)) {
+		/* Transform it back into a scatter ABD for freeing */
+		struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+		ABD_SCATTER(abd).abd_nents = 1;
+		ABD_SCATTER(abd).abd_offset = 0;
+		ABD_SCATTER(abd).abd_sgl = sg;
+		abd_free_scatter(abd);
+		return;
+	}
 	if (abd->abd_flags & ABD_FLAG_META) {
 		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
 	} else {
@@ -718,7 +745,8 @@ abd_t *
 abd_alloc_sametype(abd_t *sabd, size_t size)
 {
 	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
-	if (abd_is_linear(sabd)) {
+	if (abd_is_linear(sabd) &&
+	    !abd_is_linear_page(sabd)) {
 		return (abd_alloc_linear(size, is_metadata));
 	} else {
 		return (abd_alloc(size, is_metadata));
@@ -966,6 +994,16 @@ abd_release_ownership_of_buf(abd_t *abd)
 {
 	ASSERT(abd_is_linear(abd));
 	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+	/*
+	 * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+	 * Since that flag does not survive the
+	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
+	 * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
+	 */
+	ASSERT(!abd_is_linear_page(abd));
+
 	abd_verify(abd);
 
 	abd->abd_flags &= ~ABD_FLAG_OWNER;
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 53a44bdaf..dd382b065 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -2917,7 +2917,8 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
-	 * allocate a new buffer to store the buf's data.
+	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
+	 * buffer to store the buf's data.
 	 *
 	 * There are two additional restrictions here because we're sharing
 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
@@ -2925,10 +2926,17 @@
 	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
-	 * need to be ABD-aware.
+	 * need to be ABD-aware. It must be allocated via
+	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
+	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
+	 * page" buffers because the ABD code needs to handle freeing them
+	 * specially.
 	 */
-	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
-	    hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
+	boolean_t can_share = arc_can_share(hdr, buf) &&
+	    !HDR_L2_WRITING(hdr) &&
+	    hdr->b_l1hdr.b_pabd != NULL &&
+	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
+	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
@@ -3731,7 +3739,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
 		 * disk, it's easiest if we just set up sharing between the
 		 * buf and the hdr.
 		 */
-		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
 		arc_hdr_free_abd(hdr, B_FALSE);
 		arc_share_buf(hdr, buf);
 	}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 94eaa5888..f1bf37704 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -345,12 +345,6 @@ zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
 {
 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
-	/*
-	 * Ensure that anyone expecting this zio to contain a linear ABD isn't
-	 * going to get a nasty surprise when they try to access the data.
-	 */
-	IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
-
 	zt->zt_orig_abd = zio->io_abd;
 	zt->zt_orig_size = zio->io_size;
 	zt->zt_bufsize = bufsize;
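
The sketch below is not part of the patch; it is a hedged illustration of the copy that the LINEAR_PAGE promotion avoids. It assumes <sys/abd.h> is included and relies only on the existing abd_borrow_buf(), abd_copy_to_buf() and abd_is_linear() interfaces; the helper name example_borrow_buf_copy is hypothetical, and the in-tree abd_borrow_buf_copy() may differ in detail.

/*
 * A single-entry scatter ABD that abd_alloc_pages() promoted to linear
 * (ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE) takes the no-copy path below,
 * which is where the ~5% improvement for ARC-cached reads comes from.
 */
static void *
example_borrow_buf_copy(abd_t *abd, size_t n)
{
	/* For linear ABDs this hands back the existing buffer; no allocation. */
	void *buf = abd_borrow_buf(abd, n);

	/* Scatter ABDs still pay for a temporary buffer plus a copy. */
	if (!abd_is_linear(abd))
		abd_copy_to_buf(buf, abd, n);
	return (buf);
}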