From 28430b51e3e2387e6f36d5b4ee5b30ef33095993 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 21 Jul 2023 14:50:48 -0400 Subject: [PATCH] Add explicit prefetches to bpobj_iterate(). To simplify error handling bpobj_iterate_blkptrs() iterates through the list of block pointers backwards. Unfortunately speculative prefetcher is currently unable to detect such patterns, that makes each block read there synchronous and very slow on HDD pools. According to my tests, added explicit prefetch reduces time needed to asynchronously delete 8 snapshots of 4 million blocks each from 20 seconds to less than one, that should free sync thread for other useful work, such as async writes, scrub, etc. While there, plug one memory leak in case of bpobj_open() error and harmonize some variable names. Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15071 --- include/sys/bpobj.h | 2 +- module/zfs/bpobj.c | 49 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index f3384f526..81bc0fe21 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -60,7 +60,7 @@ typedef struct bpobj { kmutex_t bpo_lock; objset_t *bpo_os; uint64_t bpo_object; - int bpo_epb; + uint32_t bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; uint8_t bpo_havefreed; diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 211bab565..e772caead 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { + int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; + uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) * + sizeof (blkptr_t); + uint64_t ps = start * sizeof (blkptr_t); + uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0, + ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb, + ZIO_PRIORITY_ASYNC_READ); + } + for (; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); @@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, if (dbuf) dmu_buf_rele(dbuf, FTAG); err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, FTAG, &dbuf, 0); + offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH); if (err) break; + pe = pb; + pb = MAX((dbuf->db_offset > dmu_prefetch_max) ? + dbuf->db_offset - dmu_prefetch_max : 0, ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + pb, pe - pb, ZIO_PRIORITY_ASYNC_READ); + } } ASSERT3U(offset, >=, dbuf->db_offset); @@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int64_t i = bpi->bpi_unprocessed_subobjs - 1; uint64_t offset = i * sizeof (uint64_t); - uint64_t obj_from_sublist; + uint64_t subobj; err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - offset, sizeof (uint64_t), &obj_from_sublist, - DMU_READ_PREFETCH); + offset, sizeof (uint64_t), &subobj, + DMU_READ_NO_PREFETCH); if (err) break; - bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), + + bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t), KM_SLEEP); - - err = bpobj_open(sublist, bpo->bpo_os, - obj_from_sublist); - if (err) + err = bpobj_open(subbpo, bpo->bpo_os, subobj); + if (err) { + kmem_free(subbpo, sizeof (bpobj_t)); break; + } - list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); - mutex_enter(&sublist->bpo_lock); + if (subbpo->bpo_havesubobj && + subbpo->bpo_phys->bpo_subobjs != 0) { + dmu_prefetch(subbpo->bpo_os, + subbpo->bpo_phys->bpo_subobjs, 0, 0, 0, + ZIO_PRIORITY_ASYNC_READ); + } + + list_insert_head(&stack, bpi_alloc(subbpo, bpi, i)); + mutex_enter(&subbpo->bpo_lock); bpi->bpi_unprocessed_subobjs--; } }