Stack overflow in recursive bpobj_iterate_impl

The function bpobj_iterate_impl overflows the stack when bpobjs
are deeply nested. Rewrite the function to eliminate the recursion.

Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Zuchowski <pzuchowski@datto.com>
Closes #7674
Closes #7675 
Closes #7908
This commit is contained in:
Paul Zuchowski 2019-03-06 12:50:55 -05:00 committed by Brian Behlendorf
parent 96ebc5a1a4
commit a73e8fdb93
3 changed files with 248 additions and 105 deletions

View File

@ -740,6 +740,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
*/
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);

View File

@ -206,29 +206,73 @@ bpobj_is_empty(bpobj_t *bpo)
(!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
boolean_t free)
/*
* A recursive iteration of the bpobjs would be nice here but we run the risk
* of overflowing function stack space. Instead, find each subobj and add it
* to the head of our list so it can be scanned for subjobjs. Like a
* recursive implementation, the "deepest" subobjs will be freed first.
* When a subobj is found to have no additional subojs, free it.
*/
typedef struct bpobj_info {
bpobj_t *bpi_bpo;
/*
* This object is a subobj of bpi_parent,
* at bpi_index in its subobj array.
*/
struct bpobj_info *bpi_parent;
uint64_t bpi_index;
/* How many of our subobj's are left to process. */
uint64_t bpi_unprocessed_subobjs;
/* True after having visited this bpo's directly referenced BPs. */
boolean_t bpi_visited;
list_node_t bpi_node;
} bpobj_info_t;
static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
bpi->bpi_bpo = bpo;
bpi->bpi_parent = parent;
bpi->bpi_index = index;
if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
}
return (bpi);
}
/*
* Update bpobj and all of its parents with new space accounting.
*/
static void
propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
{
for (; bpi != NULL; bpi = bpi->bpi_parent) {
bpobj_t *p = bpi->bpi_bpo;
ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
p->bpo_phys->bpo_bytes -= freed;
ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
if (p->bpo_havecomp) {
p->bpo_phys->bpo_comp -= comp_freed;
p->bpo_phys->bpo_uncomp -= uncomp_freed;
}
}
}
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
dmu_tx_t *tx, boolean_t free)
{
dmu_object_info_t doi;
int epb;
int64_t i;
int err = 0;
uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
dmu_buf_t *dbuf = NULL;
bpobj_t *bpo = bpi->bpi_bpo;
ASSERT(bpobj_is_open(bpo));
mutex_enter(&bpo->bpo_lock);
if (free)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
blkptr_t *bparray;
blkptr_t *bp;
uint64_t offset, blkoff;
offset = i * sizeof (blkptr_t);
blkoff = P2PHASE(i, bpo->bpo_epb);
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
uint64_t offset = i * sizeof (blkptr_t);
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
@ -242,119 +286,199 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
ASSERT3U(offset, >=, dbuf->db_offset);
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
bparray = dbuf->db_data;
bp = &bparray[blkoff];
blkptr_t *bparray = dbuf->db_data;
blkptr_t *bp = &bparray[blkoff];
err = func(arg, bp, tx);
if (err)
break;
if (free) {
bpo->bpo_phys->bpo_bytes -=
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
if (bpo->bpo_havecomp) {
bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
}
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
freed += bp_get_dsize_sync(spa, bp);
comp_freed += BP_GET_PSIZE(bp);
uncomp_freed += BP_GET_UCSIZE(bp);
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
bpo->bpo_phys->bpo_num_blkptrs--;
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
}
}
if (free) {
propagate_space_reduction(bpi, freed, comp_freed,
uncomp_freed, tx);
VERIFY0(dmu_free_range(bpo->bpo_os,
bpo->bpo_object,
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
DMU_OBJECT_END, tx));
}
if (dbuf) {
dmu_buf_rele(dbuf, FTAG);
dbuf = NULL;
}
if (free) {
VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
(i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx));
}
if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
goto out;
ASSERT(bpo->bpo_havecomp);
err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
if (err) {
mutex_exit(&bpo->bpo_lock);
return (err);
}
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
epb = doi.doi_data_block_size / sizeof (uint64_t);
for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
uint64_t *objarray;
uint64_t offset, blkoff;
bpobj_t sublist;
uint64_t used_before, comp_before, uncomp_before;
uint64_t used_after, comp_after, uncomp_after;
/*
* Given an initial bpo, start by freeing the BPs that are directly referenced
* by that bpo. If the bpo has subobjs, read in its last subobj and push the
* subobj to our stack. By popping items off our stack, eventually we will
* encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
* requested also free the now-empty bpo from disk and decrement
* its parent's subobj count. We continue popping each subobj from our stack,
* visiting its last subobj until they too have no more subobjs, and so on.
*/
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
dmu_tx_t *tx, boolean_t free)
{
list_t stack;
bpobj_info_t *bpi;
int err = 0;
offset = i * sizeof (uint64_t);
blkoff = P2PHASE(i, epb);
/*
* Create a "stack" for us to work with without worrying about
* stack overflows. Initialize it with the initial_bpo.
*/
list_create(&stack, sizeof (bpobj_info_t),
offsetof(bpobj_info_t, bpi_node));
mutex_enter(&initial_bpo->bpo_lock);
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os,
bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
if (err)
while ((bpi = list_head(&stack)) != NULL) {
bpobj_t *bpo = bpi->bpi_bpo;
ASSERT3P(bpo, !=, NULL);
ASSERT(MUTEX_HELD(&bpo->bpo_lock));
ASSERT(bpobj_is_open(bpo));
if (free)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpi->bpi_visited == B_FALSE) {
err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
bpi->bpi_visited = B_TRUE;
if (err != 0)
break;
}
ASSERT3U(offset, >=, dbuf->db_offset);
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
objarray = dbuf->db_data;
err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
if (err)
break;
if (free) {
err = bpobj_space(&sublist,
&used_before, &comp_before, &uncomp_before);
if (err != 0) {
bpobj_close(&sublist);
break;
}
}
err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
if (free) {
VERIFY3U(0, ==, bpobj_space(&sublist,
&used_after, &comp_after, &uncomp_after));
bpo->bpo_phys->bpo_bytes -= used_before - used_after;
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
bpo->bpo_phys->bpo_uncomp -=
uncomp_before - uncomp_after;
}
bpobj_close(&sublist);
if (err)
break;
if (free) {
err = dmu_object_free(bpo->bpo_os,
objarray[blkoff], tx);
if (err)
break;
bpo->bpo_phys->bpo_num_subobjs--;
ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
}
}
if (dbuf) {
dmu_buf_rele(dbuf, FTAG);
dbuf = NULL;
}
if (free) {
VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
bpo->bpo_phys->bpo_subobjs,
(i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx));
}
out:
/* If there are no entries, there should be no bytes. */
/*
* We've finished with this bpo's directly-referenced BP's and
* it has no more unprocessed subobjs. We can free its
* bpobj_info_t (unless it is the topmost, initial_bpo).
* If we are freeing from disk, we can also do that.
*/
if (bpi->bpi_unprocessed_subobjs == 0) {
/*
* If there are no entries, there should
* be no bytes.
*/
if (bpobj_is_empty(bpo)) {
ASSERT0(bpo->bpo_phys->bpo_bytes);
ASSERT0(bpo->bpo_phys->bpo_comp);
ASSERT0(bpo->bpo_phys->bpo_uncomp);
}
/* The initial_bpo has no parent and is not closed. */
if (bpi->bpi_parent != NULL) {
if (free) {
bpobj_t *p = bpi->bpi_parent->bpi_bpo;
ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
ASSERT3U(p->bpo_phys->bpo_num_subobjs,
>, 0);
ASSERT3U(bpi->bpi_index, ==,
p->bpo_phys->bpo_num_subobjs - 1);
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
tx));
p->bpo_phys->bpo_num_subobjs--;
VERIFY0(dmu_free_range(p->bpo_os,
p->bpo_phys->bpo_subobjs,
bpi->bpi_index * sizeof (uint64_t),
sizeof (uint64_t), tx));
/* eliminate the empty subobj list */
if (bpo->bpo_havesubobj &&
bpo->bpo_phys->bpo_subobjs != 0) {
ASSERT0(bpo->bpo_phys->
bpo_num_subobjs);
err = dmu_object_free(
bpo->bpo_os,
bpo->bpo_phys->bpo_subobjs,
tx);
if (err)
break;
bpo->bpo_phys->bpo_subobjs = 0;
}
err = dmu_object_free(p->bpo_os,
bpo->bpo_object, tx);
if (err)
break;
}
mutex_exit(&bpo->bpo_lock);
bpobj_close(bpo);
kmem_free(bpo, sizeof (bpobj_t));
} else {
mutex_exit(&bpo->bpo_lock);
}
/*
* Finished processing this bpo. Unlock, and free
* our "stack" info.
*/
list_remove_head(&stack);
kmem_free(bpi, sizeof (bpobj_info_t));
} else {
/*
* We have unprocessed subobjs. Process the next one.
*/
ASSERT(bpo->bpo_havecomp);
/* Add the last subobj to stack. */
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
uint64_t offset = i * sizeof (uint64_t);
uint64_t obj_from_sublist;
err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
offset, sizeof (uint64_t), &obj_from_sublist,
DMU_READ_PREFETCH);
if (err)
break;
bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
KM_SLEEP);
err = bpobj_open(sublist, bpo->bpo_os,
obj_from_sublist);
if (err)
break;
list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
mutex_enter(&sublist->bpo_lock);
bpi->bpi_unprocessed_subobjs--;
}
}
/*
* Cleanup anything left on the "stack" after we left the loop.
* Every bpo on the stack is locked so we must remember to undo
* that now (in LIFO order).
*/
while ((bpi = list_remove_head(&stack)) != NULL) {
bpobj_t *bpo = bpi->bpi_bpo;
ASSERT(err != 0);
ASSERT3P(bpo, !=, NULL);
mutex_exit(&bpo->bpo_lock);
/* do not free the initial_bpo */
if (bpi->bpi_parent != NULL) {
bpobj_close(bpi->bpi_bpo);
kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
}
kmem_free(bpi, sizeof (bpobj_info_t));
}
list_destroy(&stack);
return (err);
}

View File

@ -2311,6 +2311,23 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
}
boolean_t
dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
mutex_enter(&db->db_mtx);
for (dbuf_dirty_record_t *dr = db->db_last_dirty;
dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
if (dr->dr_txg == tx->tx_txg) {
mutex_exit(&db->db_mtx);
return (B_TRUE);
}
}
mutex_exit(&db->db_mtx);
return (B_FALSE);
}
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
@ -4564,6 +4581,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);