From a73e8fdb93d24b885f0c38202a34da51013d674a Mon Sep 17 00:00:00 2001
From: Paul Zuchowski <31706010+PaulZ-98@users.noreply.github.com>
Date: Wed, 6 Mar 2019 12:50:55 -0500
Subject: [PATCH] Stack overflow in recursive bpobj_iterate_impl

The function bpobj_iterate_impl overflows the stack when bpobjs are
deeply nested. Rewrite the function to eliminate the recursion.

Reviewed-by: Serapheim Dimitropoulos
Reviewed-by: Matt Ahrens
Reviewed-by: Brian Behlendorf
Signed-off-by: Paul Zuchowski
Closes #7674
Closes #7675
Closes #7908
---
 include/sys/dmu.h  |   1 +
 module/zfs/bpobj.c | 334 +++++++++++++++++++++++++++++++--------------
 module/zfs/dbuf.c  |  18 +++
 3 files changed, 248 insertions(+), 105 deletions(-)

diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 63c51ecfb..e4c2ebc2f 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -740,6 +740,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c
index 94bd1fd8f..633801956 100644
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@@ -206,29 +206,73 @@ bpobj_is_empty(bpobj_t *bpo)
 	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
 }

-static int
-bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
-    boolean_t free)
+/*
+ * A recursive iteration of the bpobjs would be nice here but we run the risk
+ * of overflowing function stack space. Instead, find each subobj and add it
+ * to the head of our list so it can be scanned for subobjs. Like a
+ * recursive implementation, the "deepest" subobjs will be freed first.
+ * When a subobj is found to have no additional subobjs, free it.
+ */
+typedef struct bpobj_info {
+	bpobj_t *bpi_bpo;
+	/*
+	 * This object is a subobj of bpi_parent,
+	 * at bpi_index in its subobj array.
+	 */
+	struct bpobj_info *bpi_parent;
+	uint64_t bpi_index;
+	/* How many of our subobjs are left to process. */
+	uint64_t bpi_unprocessed_subobjs;
+	/* True after having visited this bpo's directly referenced BPs. */
+	boolean_t bpi_visited;
+	list_node_t bpi_node;
+} bpobj_info_t;
+
+static bpobj_info_t *
+bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
+{
+	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
+	bpi->bpi_bpo = bpo;
+	bpi->bpi_parent = parent;
+	bpi->bpi_index = index;
+	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
+	}
+	return (bpi);
+}
+
+/*
+ * Update bpobj and all of its parents with new space accounting.
+ */
+static void
+propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
+    uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
+{
+
+	for (; bpi != NULL; bpi = bpi->bpi_parent) {
+		bpobj_t *p = bpi->bpi_bpo;
+		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
+		p->bpo_phys->bpo_bytes -= freed;
+		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
+		if (p->bpo_havecomp) {
+			p->bpo_phys->bpo_comp -= comp_freed;
+			p->bpo_phys->bpo_uncomp -= uncomp_freed;
+		}
+	}
+}
+
+static int
+bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
+    dmu_tx_t *tx, boolean_t free)
 {
-	dmu_object_info_t doi;
-	int epb;
-	int64_t i;
 	int err = 0;
+	uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
 	dmu_buf_t *dbuf = NULL;
+	bpobj_t *bpo = bpi->bpi_bpo;

-	ASSERT(bpobj_is_open(bpo));
-	mutex_enter(&bpo->bpo_lock);
-
-	if (free)
-		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
-
-	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
-		blkptr_t *bparray;
-		blkptr_t *bp;
-		uint64_t offset, blkoff;
-
-		offset = i * sizeof (blkptr_t);
-		blkoff = P2PHASE(i, bpo->bpo_epb);
+	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+		uint64_t offset = i * sizeof (blkptr_t);
+		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

 		if (dbuf == NULL || dbuf->db_offset > offset) {
 			if (dbuf)
@@ -242,119 +286,199 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
 		ASSERT3U(offset, >=, dbuf->db_offset);
 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

-		bparray = dbuf->db_data;
-		bp = &bparray[blkoff];
+		blkptr_t *bparray = dbuf->db_data;
+		blkptr_t *bp = &bparray[blkoff];
 		err = func(arg, bp, tx);
 		if (err)
 			break;
+
 		if (free) {
-			bpo->bpo_phys->bpo_bytes -=
-			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
-			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
-			if (bpo->bpo_havecomp) {
-				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
-				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
-			}
+			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
+			freed += bp_get_dsize_sync(spa, bp);
+			comp_freed += BP_GET_PSIZE(bp);
+			uncomp_freed += BP_GET_UCSIZE(bp);
+			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
 			bpo->bpo_phys->bpo_num_blkptrs--;
 			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
 		}
 	}
+	if (free) {
+		propagate_space_reduction(bpi, freed, comp_freed,
+		    uncomp_freed, tx);
+		VERIFY0(dmu_free_range(bpo->bpo_os,
+		    bpo->bpo_object,
+		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+		    DMU_OBJECT_END, tx));
+	}
 	if (dbuf) {
 		dmu_buf_rele(dbuf, FTAG);
 		dbuf = NULL;
 	}
-	if (free) {
-		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
-		    (i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx));
-	}
-	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
-		goto out;
+	return (err);
+}

-	ASSERT(bpo->bpo_havecomp);
-	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
-	if (err) {
-		mutex_exit(&bpo->bpo_lock);
-		return (err);
-	}
-	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
-	epb = doi.doi_data_block_size / sizeof (uint64_t);
+/*
+ * Given an initial bpo, start by freeing the BPs that are directly referenced
+ * by that bpo. If the bpo has subobjs, read in its last subobj and push the
+ * subobj to our stack. By popping items off our stack, eventually we will
+ * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
+ * requested also free the now-empty bpo from disk and decrement
+ * its parent's subobj count. We continue popping each subobj from our stack,
+ * visiting its last subobj until they too have no more subobjs, and so on.
+ */ +static int +bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, + dmu_tx_t *tx, boolean_t free) +{ + list_t stack; + bpobj_info_t *bpi; + int err = 0; - for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { - uint64_t *objarray; - uint64_t offset, blkoff; - bpobj_t sublist; - uint64_t used_before, comp_before, uncomp_before; - uint64_t used_after, comp_after, uncomp_after; + /* + * Create a "stack" for us to work with without worrying about + * stack overflows. Initialize it with the initial_bpo. + */ + list_create(&stack, sizeof (bpobj_info_t), + offsetof(bpobj_info_t, bpi_node)); + mutex_enter(&initial_bpo->bpo_lock); + list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); - offset = i * sizeof (uint64_t); - blkoff = P2PHASE(i, epb); + while ((bpi = list_head(&stack)) != NULL) { + bpobj_t *bpo = bpi->bpi_bpo; - if (dbuf == NULL || dbuf->db_offset > offset) { - if (dbuf) - dmu_buf_rele(dbuf, FTAG); - err = dmu_buf_hold(bpo->bpo_os, - bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); - if (err) + ASSERT3P(bpo, !=, NULL); + ASSERT(MUTEX_HELD(&bpo->bpo_lock)); + ASSERT(bpobj_is_open(bpo)); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + if (bpi->bpi_visited == B_FALSE) { + err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free); + bpi->bpi_visited = B_TRUE; + if (err != 0) break; } - - ASSERT3U(offset, >=, dbuf->db_offset); - ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - - objarray = dbuf->db_data; - err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); - if (err) - break; - if (free) { - err = bpobj_space(&sublist, - &used_before, &comp_before, &uncomp_before); - if (err != 0) { - bpobj_close(&sublist); - break; + /* + * We've finished with this bpo's directly-referenced BP's and + * it has no more unprocessed subobjs. We can free its + * bpobj_info_t (unless it is the topmost, initial_bpo). + * If we are freeing from disk, we can also do that. + */ + if (bpi->bpi_unprocessed_subobjs == 0) { + /* + * If there are no entries, there should + * be no bytes. + */ + if (bpobj_is_empty(bpo)) { + ASSERT0(bpo->bpo_phys->bpo_bytes); + ASSERT0(bpo->bpo_phys->bpo_comp); + ASSERT0(bpo->bpo_phys->bpo_uncomp); } - } - err = bpobj_iterate_impl(&sublist, func, arg, tx, free); - if (free) { - VERIFY3U(0, ==, bpobj_space(&sublist, - &used_after, &comp_after, &uncomp_after)); - bpo->bpo_phys->bpo_bytes -= used_before - used_after; - ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); - bpo->bpo_phys->bpo_comp -= comp_before - comp_after; - bpo->bpo_phys->bpo_uncomp -= - uncomp_before - uncomp_after; - } - bpobj_close(&sublist); - if (err) - break; - if (free) { - err = dmu_object_free(bpo->bpo_os, - objarray[blkoff], tx); + /* The initial_bpo has no parent and is not closed. 
+			if (bpi->bpi_parent != NULL) {
+				if (free) {
+					bpobj_t *p = bpi->bpi_parent->bpi_bpo;
+
+					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
+					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
+					    >, 0);
+					ASSERT3U(bpi->bpi_index, ==,
+					    p->bpo_phys->bpo_num_subobjs - 1);
+					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
+					    tx));
+
+					p->bpo_phys->bpo_num_subobjs--;
+
+					VERIFY0(dmu_free_range(p->bpo_os,
+					    p->bpo_phys->bpo_subobjs,
+					    bpi->bpi_index * sizeof (uint64_t),
+					    sizeof (uint64_t), tx));
+
+					/* eliminate the empty subobj list */
+					if (bpo->bpo_havesubobj &&
+					    bpo->bpo_phys->bpo_subobjs != 0) {
+						ASSERT0(bpo->bpo_phys->
+						    bpo_num_subobjs);
+						err = dmu_object_free(
+						    bpo->bpo_os,
+						    bpo->bpo_phys->bpo_subobjs,
+						    tx);
+						if (err)
+							break;
+						bpo->bpo_phys->bpo_subobjs = 0;
+					}
+					err = dmu_object_free(p->bpo_os,
+					    bpo->bpo_object, tx);
+					if (err)
+						break;
+				}
+
+				mutex_exit(&bpo->bpo_lock);
+				bpobj_close(bpo);
+				kmem_free(bpo, sizeof (bpobj_t));
+			} else {
+				mutex_exit(&bpo->bpo_lock);
+			}
+
+			/*
+			 * Finished processing this bpo. Unlock, and free
+			 * our "stack" info.
+			 */
+			list_remove_head(&stack);
+			kmem_free(bpi, sizeof (bpobj_info_t));
+		} else {
+			/*
+			 * We have unprocessed subobjs. Process the next one.
+			 */
+			ASSERT(bpo->bpo_havecomp);
+
+			/* Add the last subobj to stack. */
+			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
+			uint64_t offset = i * sizeof (uint64_t);
+
+			uint64_t obj_from_sublist;
+			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+			    offset, sizeof (uint64_t), &obj_from_sublist,
+			    DMU_READ_PREFETCH);
 			if (err)
 				break;
-			bpo->bpo_phys->bpo_num_subobjs--;
-			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+			bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
+			    KM_SLEEP);
+
+			err = bpobj_open(sublist, bpo->bpo_os,
+			    obj_from_sublist);
+			if (err)
+				break;
+
+			list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
+			mutex_enter(&sublist->bpo_lock);
+			bpi->bpi_unprocessed_subobjs--;
 		}
 	}
-	if (dbuf) {
-		dmu_buf_rele(dbuf, FTAG);
-		dbuf = NULL;
-	}
-	if (free) {
-		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
-		    bpo->bpo_phys->bpo_subobjs,
-		    (i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx));
+	/*
+	 * Clean up anything left on the "stack" after we left the loop.
+	 * Every bpo on the stack is locked so we must remember to undo
+	 * that now (in LIFO order).
+	 */
+	while ((bpi = list_remove_head(&stack)) != NULL) {
+		bpobj_t *bpo = bpi->bpi_bpo;
+		ASSERT(err != 0);
+		ASSERT3P(bpo, !=, NULL);
+
+		mutex_exit(&bpo->bpo_lock);
+
+		/* do not free the initial_bpo */
+		if (bpi->bpi_parent != NULL) {
+			bpobj_close(bpi->bpi_bpo);
+			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
+		}
+		kmem_free(bpi, sizeof (bpobj_info_t));
 	}
-out:
-	/* If there are no entries, there should be no bytes.
*/ - if (bpobj_is_empty(bpo)) { - ASSERT0(bpo->bpo_phys->bpo_bytes); - ASSERT0(bpo->bpo_phys->bpo_comp); - ASSERT0(bpo->bpo_phys->bpo_uncomp); - } + list_destroy(&stack); - mutex_exit(&bpo->bpo_lock); return (err); } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 9f3c9bfd5..28ff5fc7e 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2311,6 +2311,23 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); } +boolean_t +dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + for (dbuf_dirty_record_t *dr = db->db_last_dirty; + dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { + if (dr->dr_txg == tx->tx_txg) { + mutex_exit(&db->db_mtx); + return (B_TRUE); + } + } + mutex_exit(&db->db_mtx); + return (B_FALSE); +} + void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -4564,6 +4581,7 @@ EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); +EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done);
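
Editor's note (illustration only, not part of the patch): the core technique above is to replace unbounded recursion over nested bpobjs with an explicit, heap-allocated LIFO list of bookkeeping records (bpobj_info_t), pushing the last unprocessed subobj onto the list head and popping a record once all of its subobjs have been handled, so the most deeply nested objects are completed first. The stand-alone C sketch below demonstrates that same explicit-stack traversal pattern in miniature; the node_t and frame_t types, frame_push(), and the visit callback are hypothetical stand-ins for bpobj_t, bpobj_info_t, bpi_alloc(), and bpobj_itor_t, not ZFS APIs.

/*
 * Illustrative sketch of the explicit-stack pattern used by the new
 * bpobj_iterate_impl, applied to a hypothetical node_t tree.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
	int		n_value;	/* stands in for a bpobj's own BPs */
	struct node	**n_children;	/* nested objects, like subobjs */
	int		n_nchildren;
} node_t;

/* Per-node bookkeeping record, analogous to bpobj_info_t. */
typedef struct frame {
	node_t		*f_node;
	int		f_unprocessed;	/* children still to visit */
	int		f_visited;	/* payload already handled? */
	struct frame	*f_next;	/* list head == top of stack */
} frame_t;

static frame_t *
frame_push(frame_t *top, node_t *n)
{
	frame_t *f = calloc(1, sizeof (*f));

	f->f_node = n;
	f->f_unprocessed = n->n_nchildren;
	f->f_next = top;
	return (f);
}

/*
 * Depth-first visit of every node with no recursion: work on the top of
 * the stack, push the last unprocessed child (so the deepest nodes are
 * completed first, as in the patch), and pop a frame once all of its
 * children are done. Memory use is bounded by nesting depth, but it
 * lives on the heap rather than the call stack.
 */
static void
iterate(node_t *root, void (*visit)(node_t *))
{
	frame_t *top = frame_push(NULL, root);

	while (top != NULL) {
		frame_t *f = top;

		if (!f->f_visited) {
			visit(f->f_node);
			f->f_visited = 1;
		}
		if (f->f_unprocessed == 0) {
			top = f->f_next;			/* pop */
			free(f);
		} else {
			node_t *child =
			    f->f_node->n_children[--f->f_unprocessed];
			top = frame_push(top, child);		/* push */
		}
	}
}

static void
print_node(node_t *n)
{
	printf("%d\n", n->n_value);
}

int
main(void)
{
	/* A three-deep chain: 1 -> 2 -> 3. */
	node_t c = { 3, NULL, 0 };
	node_t *cp = &c;
	node_t b = { 2, &cp, 1 };
	node_t *bp = &b;
	node_t a = { 1, &bp, 1 };

	iterate(&a, print_node);
	return (0);
}

Because a frame is popped only after its children are finished, the deepest objects complete first; in the patch this is what lets the free path remove empty subobj arrays and their on-disk objects bottom-up while parents are still on the list to receive the space-accounting updates.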