From a4862125b887c4b9f6bc3889119ef030da5a4248 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 17 Aug 2021 12:15:54 -0400 Subject: [PATCH] Remove b_pabd/b_rabd allocation from arc_hdr_alloc() When a header is allocated for full overwrite it is a waste of time to allocate b_pabd/b_rabd for it, since arc_write() will free them without ever being touched. If it is a read or a partial overwrite then arc_read() and arc_hdr_decrypt() allocate them explicitly. Reduced memory allocation in user threads also reduces ARC eviction throttling there, proportionally increasing it in ZIO threads, that is not good. To minimize or even avoid it introduce ARC allocation reserve, allowing certain arc_get_data_abd() callers to allocate a bit longer in situations where user threads will already throttle. Reviewed-by: George Wilson Reviewed-by: Mark Maybee Signed-off-by: Alexander Motin Closes #12398 --- include/sys/arc_impl.h | 2 +- module/os/freebsd/zfs/arc_os.c | 2 +- module/os/linux/zfs/arc_os.c | 2 +- module/zfs/arc.c | 113 +++++++++++++++++++-------------- 4 files changed, 68 insertions(+), 51 deletions(-) diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index a38083714..43818d410 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -989,7 +989,7 @@ extern unsigned long zfs_arc_max; extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); -extern void arc_wait_for_eviction(uint64_t); +extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 3b8b11cff..fddb1f0e8 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -233,7 +233,7 @@ arc_lowmem(void *arg __unused, int howto __unused) * with ARC reclaim thread. */ if (curproc == pageproc) - arc_wait_for_eviction(to_free); + arc_wait_for_eviction(to_free, B_FALSE); } void diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index c3e889d54..f96cd1271 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -217,7 +217,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) * for the requested amount of data to be evicted. */ arc_reduce_target_size(ptob(sc->nr_to_scan)); - arc_wait_for_eviction(ptob(sc->nr_to_scan)); + arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); if (current->reclaim_state != NULL) current->reclaim_state->reclaimed_slab += sc->nr_to_scan; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 677b80029..a89a9e65d 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -835,12 +835,13 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_DO_ADAPT = 0x2, + ARC_HDR_USE_RESERVE = 0x4, }; -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); @@ -1855,7 +1856,8 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + ARC_HDR_DO_ADAPT); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -3170,7 +3172,6 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); - boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3181,14 +3182,14 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, - do_adapt); + alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, - do_adapt); + alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } @@ -3234,13 +3235,34 @@ arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } +/* + * Allocate empty anonymous ARC header. The header will get its identity + * assigned and buffers attached later as part of read or write operations. + * + * In case of read arc_read() assigns header its identify (b_dva + b_birth), + * inserts it into ARC hash to become globally visible and allocates physical + * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read + * completion arc_read_done() allocates ARC buffer(s) as needed, potentially + * sharing one of them with the physical ABD buffer. + * + * In case of write arc_alloc_buf() allocates ARC buffer to be filled with + * data. Then after compression and/or encryption arc_write_ready() allocates + * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD + * buffer. On disk write completion arc_write_done() assigns the header its + * new identity (b_dva + b_birth) and inserts into ARC hash. + * + * In case of partial overwrite the old data is read first as described. Then + * arc_release() either allocates new anonymous ARC header and moves the ARC + * buffer to it, or reuses the old ARC header by discarding its identity and + * removing it from ARC hash. After buffer modification normal write process + * follows as described. + */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, - arc_buf_contents_t type, boolean_t alloc_rdata) + arc_buf_contents_t type) { arc_buf_hdr_t *hdr; - int flags = ARC_HDR_DO_ADAPT; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { @@ -3248,7 +3270,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } - flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0; ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); @@ -3272,12 +3293,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; - /* - * Allocate the hdr's buffer. This will contain either - * the compressed or uncompressed data depending on the block - * it references and compressed arc enablement. - */ - arc_hdr_alloc_abd(hdr, flags); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -3564,7 +3579,7 @@ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE); + B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3588,7 +3603,7 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE); + B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3596,16 +3611,12 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); - if (!arc_buf_is_shared(buf)) { - /* - * To ensure that the hdr has the correct data in it if we call - * arc_untransform() on this buf before it's been written to - * disk, it's easiest if we just set up sharing between the - * buf and the hdr. - */ - arc_hdr_free_abd(hdr, B_FALSE); - arc_share_buf(hdr, buf); - } + /* + * To ensure that the hdr has the correct data in it if we call + * arc_untransform() on this buf before it's been written to disk, + * it's easiest if we just set up sharing between the buf and the hdr. + */ + arc_share_buf(hdr, buf); return (buf); } @@ -3627,7 +3638,7 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, - compression_type, complevel, type, B_TRUE); + compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -5125,7 +5136,7 @@ arc_adapt(int bytes, arc_state_t *state) * zfs_arc_overflow_shift. */ static arc_ovf_level_t -arc_is_overflowing(void) +arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, @@ -5142,17 +5153,19 @@ arc_is_overflowing(void) */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; + if (!use_reserve) + overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, - boolean_t do_adapt) + int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, do_adapt); + arc_get_data_impl(hdr, size, tag, alloc_flags); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { @@ -5166,7 +5179,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, B_TRUE); + arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5183,9 +5196,9 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * of ARC behavior and settings. See arc_lowmem_init(). */ void -arc_wait_for_eviction(uint64_t amount) +arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { - switch (arc_is_overflowing()) { + switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: @@ -5262,12 +5275,12 @@ arc_wait_for_eviction(uint64_t amount) */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, - boolean_t do_adapt) + int alloc_flags) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); - if (do_adapt) + if (alloc_flags & ARC_HDR_DO_ADAPT) arc_adapt(size, state); /* @@ -5283,7 +5296,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ - arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100); + arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, + alloc_flags & ARC_HDR_USE_RESERVE); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -6093,8 +6107,7 @@ top: arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type, - encrypted_read); + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); @@ -6108,6 +6121,7 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } + alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6155,9 +6169,9 @@ top: */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); - arc_hdr_alloc_abd(hdr, alloc_flags); } + arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); @@ -6679,7 +6693,7 @@ arc_release(arc_buf_t *buf, void *tag) * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, - compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr)); + compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); @@ -6859,7 +6873,8 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { /* @@ -6869,17 +6884,19 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, - ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | + ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -8689,7 +8706,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - B_TRUE); + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8726,7 +8743,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - B_TRUE); + ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),