mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 02:14:28 +03:00
Stack overflow in recursive bpobj_iterate_impl
The function bpobj_iterate_impl overflows the stack when bpobjs are deeply nested. Rewrite the function to eliminate the recursion. Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Reviewed-by: Matt Ahrens <mahrens@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Paul Zuchowski <pzuchowski@datto.com> Closes #7674 Closes #7675 Closes #7908
This commit is contained in:
parent
96ebc5a1a4
commit
a73e8fdb93
@ -740,6 +740,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
|
||||
* (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
|
||||
*/
|
||||
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
|
||||
boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
|
||||
void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
|
||||
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
|
||||
|
||||
|
@ -206,29 +206,73 @@ bpobj_is_empty(bpobj_t *bpo)
|
||||
(!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
|
||||
}
|
||||
|
||||
static int
|
||||
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
|
||||
boolean_t free)
|
||||
/*
|
||||
* A recursive iteration of the bpobjs would be nice here but we run the risk
|
||||
* of overflowing function stack space. Instead, find each subobj and add it
|
||||
* to the head of our list so it can be scanned for subjobjs. Like a
|
||||
* recursive implementation, the "deepest" subobjs will be freed first.
|
||||
* When a subobj is found to have no additional subojs, free it.
|
||||
*/
|
||||
typedef struct bpobj_info {
|
||||
bpobj_t *bpi_bpo;
|
||||
/*
|
||||
* This object is a subobj of bpi_parent,
|
||||
* at bpi_index in its subobj array.
|
||||
*/
|
||||
struct bpobj_info *bpi_parent;
|
||||
uint64_t bpi_index;
|
||||
/* How many of our subobj's are left to process. */
|
||||
uint64_t bpi_unprocessed_subobjs;
|
||||
/* True after having visited this bpo's directly referenced BPs. */
|
||||
boolean_t bpi_visited;
|
||||
list_node_t bpi_node;
|
||||
} bpobj_info_t;
|
||||
|
||||
static bpobj_info_t *
|
||||
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
|
||||
{
|
||||
bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
|
||||
bpi->bpi_bpo = bpo;
|
||||
bpi->bpi_parent = parent;
|
||||
bpi->bpi_index = index;
|
||||
if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
|
||||
bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
|
||||
}
|
||||
return (bpi);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update bpobj and all of its parents with new space accounting.
|
||||
*/
|
||||
static void
|
||||
propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
|
||||
uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
|
||||
{
|
||||
|
||||
for (; bpi != NULL; bpi = bpi->bpi_parent) {
|
||||
bpobj_t *p = bpi->bpi_bpo;
|
||||
ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
|
||||
p->bpo_phys->bpo_bytes -= freed;
|
||||
ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
|
||||
if (p->bpo_havecomp) {
|
||||
p->bpo_phys->bpo_comp -= comp_freed;
|
||||
p->bpo_phys->bpo_uncomp -= uncomp_freed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
|
||||
dmu_tx_t *tx, boolean_t free)
|
||||
{
|
||||
dmu_object_info_t doi;
|
||||
int epb;
|
||||
int64_t i;
|
||||
int err = 0;
|
||||
uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
|
||||
dmu_buf_t *dbuf = NULL;
|
||||
bpobj_t *bpo = bpi->bpi_bpo;
|
||||
|
||||
ASSERT(bpobj_is_open(bpo));
|
||||
mutex_enter(&bpo->bpo_lock);
|
||||
|
||||
if (free)
|
||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||
|
||||
for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
|
||||
blkptr_t *bparray;
|
||||
blkptr_t *bp;
|
||||
uint64_t offset, blkoff;
|
||||
|
||||
offset = i * sizeof (blkptr_t);
|
||||
blkoff = P2PHASE(i, bpo->bpo_epb);
|
||||
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
|
||||
uint64_t offset = i * sizeof (blkptr_t);
|
||||
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
|
||||
|
||||
if (dbuf == NULL || dbuf->db_offset > offset) {
|
||||
if (dbuf)
|
||||
@ -242,119 +286,199 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
|
||||
ASSERT3U(offset, >=, dbuf->db_offset);
|
||||
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
|
||||
|
||||
bparray = dbuf->db_data;
|
||||
bp = &bparray[blkoff];
|
||||
blkptr_t *bparray = dbuf->db_data;
|
||||
blkptr_t *bp = &bparray[blkoff];
|
||||
err = func(arg, bp, tx);
|
||||
if (err)
|
||||
break;
|
||||
|
||||
if (free) {
|
||||
bpo->bpo_phys->bpo_bytes -=
|
||||
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
|
||||
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
|
||||
if (bpo->bpo_havecomp) {
|
||||
bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
|
||||
bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
|
||||
}
|
||||
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
|
||||
freed += bp_get_dsize_sync(spa, bp);
|
||||
comp_freed += BP_GET_PSIZE(bp);
|
||||
uncomp_freed += BP_GET_UCSIZE(bp);
|
||||
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
|
||||
bpo->bpo_phys->bpo_num_blkptrs--;
|
||||
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
|
||||
}
|
||||
}
|
||||
if (free) {
|
||||
propagate_space_reduction(bpi, freed, comp_freed,
|
||||
uncomp_freed, tx);
|
||||
VERIFY0(dmu_free_range(bpo->bpo_os,
|
||||
bpo->bpo_object,
|
||||
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
|
||||
DMU_OBJECT_END, tx));
|
||||
}
|
||||
if (dbuf) {
|
||||
dmu_buf_rele(dbuf, FTAG);
|
||||
dbuf = NULL;
|
||||
}
|
||||
if (free) {
|
||||
VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
|
||||
(i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx));
|
||||
}
|
||||
if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
|
||||
goto out;
|
||||
return (err);
|
||||
}
|
||||
|
||||
ASSERT(bpo->bpo_havecomp);
|
||||
err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
|
||||
if (err) {
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
return (err);
|
||||
}
|
||||
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
|
||||
epb = doi.doi_data_block_size / sizeof (uint64_t);
|
||||
/*
|
||||
* Given an initial bpo, start by freeing the BPs that are directly referenced
|
||||
* by that bpo. If the bpo has subobjs, read in its last subobj and push the
|
||||
* subobj to our stack. By popping items off our stack, eventually we will
|
||||
* encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
|
||||
* requested also free the now-empty bpo from disk and decrement
|
||||
* its parent's subobj count. We continue popping each subobj from our stack,
|
||||
* visiting its last subobj until they too have no more subobjs, and so on.
|
||||
*/
|
||||
static int
|
||||
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
|
||||
dmu_tx_t *tx, boolean_t free)
|
||||
{
|
||||
list_t stack;
|
||||
bpobj_info_t *bpi;
|
||||
int err = 0;
|
||||
|
||||
for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
|
||||
uint64_t *objarray;
|
||||
uint64_t offset, blkoff;
|
||||
bpobj_t sublist;
|
||||
uint64_t used_before, comp_before, uncomp_before;
|
||||
uint64_t used_after, comp_after, uncomp_after;
|
||||
/*
|
||||
* Create a "stack" for us to work with without worrying about
|
||||
* stack overflows. Initialize it with the initial_bpo.
|
||||
*/
|
||||
list_create(&stack, sizeof (bpobj_info_t),
|
||||
offsetof(bpobj_info_t, bpi_node));
|
||||
mutex_enter(&initial_bpo->bpo_lock);
|
||||
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
|
||||
|
||||
offset = i * sizeof (uint64_t);
|
||||
blkoff = P2PHASE(i, epb);
|
||||
while ((bpi = list_head(&stack)) != NULL) {
|
||||
bpobj_t *bpo = bpi->bpi_bpo;
|
||||
|
||||
if (dbuf == NULL || dbuf->db_offset > offset) {
|
||||
if (dbuf)
|
||||
dmu_buf_rele(dbuf, FTAG);
|
||||
err = dmu_buf_hold(bpo->bpo_os,
|
||||
bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
|
||||
if (err)
|
||||
ASSERT3P(bpo, !=, NULL);
|
||||
ASSERT(MUTEX_HELD(&bpo->bpo_lock));
|
||||
ASSERT(bpobj_is_open(bpo));
|
||||
|
||||
if (free)
|
||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||
|
||||
if (bpi->bpi_visited == B_FALSE) {
|
||||
err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
|
||||
bpi->bpi_visited = B_TRUE;
|
||||
if (err != 0)
|
||||
break;
|
||||
}
|
||||
|
||||
ASSERT3U(offset, >=, dbuf->db_offset);
|
||||
ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
|
||||
|
||||
objarray = dbuf->db_data;
|
||||
err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
|
||||
if (err)
|
||||
break;
|
||||
if (free) {
|
||||
err = bpobj_space(&sublist,
|
||||
&used_before, &comp_before, &uncomp_before);
|
||||
if (err != 0) {
|
||||
bpobj_close(&sublist);
|
||||
break;
|
||||
/*
|
||||
* We've finished with this bpo's directly-referenced BP's and
|
||||
* it has no more unprocessed subobjs. We can free its
|
||||
* bpobj_info_t (unless it is the topmost, initial_bpo).
|
||||
* If we are freeing from disk, we can also do that.
|
||||
*/
|
||||
if (bpi->bpi_unprocessed_subobjs == 0) {
|
||||
/*
|
||||
* If there are no entries, there should
|
||||
* be no bytes.
|
||||
*/
|
||||
if (bpobj_is_empty(bpo)) {
|
||||
ASSERT0(bpo->bpo_phys->bpo_bytes);
|
||||
ASSERT0(bpo->bpo_phys->bpo_comp);
|
||||
ASSERT0(bpo->bpo_phys->bpo_uncomp);
|
||||
}
|
||||
}
|
||||
err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
|
||||
if (free) {
|
||||
VERIFY3U(0, ==, bpobj_space(&sublist,
|
||||
&used_after, &comp_after, &uncomp_after));
|
||||
bpo->bpo_phys->bpo_bytes -= used_before - used_after;
|
||||
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
|
||||
bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
|
||||
bpo->bpo_phys->bpo_uncomp -=
|
||||
uncomp_before - uncomp_after;
|
||||
}
|
||||
|
||||
bpobj_close(&sublist);
|
||||
if (err)
|
||||
break;
|
||||
if (free) {
|
||||
err = dmu_object_free(bpo->bpo_os,
|
||||
objarray[blkoff], tx);
|
||||
/* The initial_bpo has no parent and is not closed. */
|
||||
if (bpi->bpi_parent != NULL) {
|
||||
if (free) {
|
||||
bpobj_t *p = bpi->bpi_parent->bpi_bpo;
|
||||
|
||||
ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
|
||||
ASSERT3U(p->bpo_phys->bpo_num_subobjs,
|
||||
>, 0);
|
||||
ASSERT3U(bpi->bpi_index, ==,
|
||||
p->bpo_phys->bpo_num_subobjs - 1);
|
||||
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
|
||||
tx));
|
||||
|
||||
p->bpo_phys->bpo_num_subobjs--;
|
||||
|
||||
VERIFY0(dmu_free_range(p->bpo_os,
|
||||
p->bpo_phys->bpo_subobjs,
|
||||
bpi->bpi_index * sizeof (uint64_t),
|
||||
sizeof (uint64_t), tx));
|
||||
|
||||
/* eliminate the empty subobj list */
|
||||
if (bpo->bpo_havesubobj &&
|
||||
bpo->bpo_phys->bpo_subobjs != 0) {
|
||||
ASSERT0(bpo->bpo_phys->
|
||||
bpo_num_subobjs);
|
||||
err = dmu_object_free(
|
||||
bpo->bpo_os,
|
||||
bpo->bpo_phys->bpo_subobjs,
|
||||
tx);
|
||||
if (err)
|
||||
break;
|
||||
bpo->bpo_phys->bpo_subobjs = 0;
|
||||
}
|
||||
err = dmu_object_free(p->bpo_os,
|
||||
bpo->bpo_object, tx);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
bpobj_close(bpo);
|
||||
kmem_free(bpo, sizeof (bpobj_t));
|
||||
} else {
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Finished processing this bpo. Unlock, and free
|
||||
* our "stack" info.
|
||||
*/
|
||||
list_remove_head(&stack);
|
||||
kmem_free(bpi, sizeof (bpobj_info_t));
|
||||
} else {
|
||||
/*
|
||||
* We have unprocessed subobjs. Process the next one.
|
||||
*/
|
||||
ASSERT(bpo->bpo_havecomp);
|
||||
|
||||
/* Add the last subobj to stack. */
|
||||
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
|
||||
uint64_t offset = i * sizeof (uint64_t);
|
||||
|
||||
uint64_t obj_from_sublist;
|
||||
err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
|
||||
offset, sizeof (uint64_t), &obj_from_sublist,
|
||||
DMU_READ_PREFETCH);
|
||||
if (err)
|
||||
break;
|
||||
bpo->bpo_phys->bpo_num_subobjs--;
|
||||
ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
|
||||
bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
|
||||
KM_SLEEP);
|
||||
|
||||
err = bpobj_open(sublist, bpo->bpo_os,
|
||||
obj_from_sublist);
|
||||
if (err)
|
||||
break;
|
||||
|
||||
list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
|
||||
mutex_enter(&sublist->bpo_lock);
|
||||
bpi->bpi_unprocessed_subobjs--;
|
||||
}
|
||||
}
|
||||
if (dbuf) {
|
||||
dmu_buf_rele(dbuf, FTAG);
|
||||
dbuf = NULL;
|
||||
}
|
||||
if (free) {
|
||||
VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
|
||||
bpo->bpo_phys->bpo_subobjs,
|
||||
(i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx));
|
||||
/*
|
||||
* Cleanup anything left on the "stack" after we left the loop.
|
||||
* Every bpo on the stack is locked so we must remember to undo
|
||||
* that now (in LIFO order).
|
||||
*/
|
||||
while ((bpi = list_remove_head(&stack)) != NULL) {
|
||||
bpobj_t *bpo = bpi->bpi_bpo;
|
||||
ASSERT(err != 0);
|
||||
ASSERT3P(bpo, !=, NULL);
|
||||
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
|
||||
/* do not free the initial_bpo */
|
||||
if (bpi->bpi_parent != NULL) {
|
||||
bpobj_close(bpi->bpi_bpo);
|
||||
kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
|
||||
}
|
||||
kmem_free(bpi, sizeof (bpobj_info_t));
|
||||
}
|
||||
|
||||
out:
|
||||
/* If there are no entries, there should be no bytes. */
|
||||
if (bpobj_is_empty(bpo)) {
|
||||
ASSERT0(bpo->bpo_phys->bpo_bytes);
|
||||
ASSERT0(bpo->bpo_phys->bpo_comp);
|
||||
ASSERT0(bpo->bpo_phys->bpo_uncomp);
|
||||
}
|
||||
list_destroy(&stack);
|
||||
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
@ -2311,6 +2311,23 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
for (dbuf_dirty_record_t *dr = db->db_last_dirty;
|
||||
dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
|
||||
if (dr->dr_txg == tx->tx_txg) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
return (B_TRUE);
|
||||
}
|
||||
}
|
||||
mutex_exit(&db->db_mtx);
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
{
|
||||
@ -4564,6 +4581,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
|
||||
EXPORT_SYMBOL(dbuf_dirty);
|
||||
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
|
||||
EXPORT_SYMBOL(dmu_buf_will_dirty);
|
||||
EXPORT_SYMBOL(dmu_buf_is_dirty);
|
||||
EXPORT_SYMBOL(dmu_buf_will_not_fill);
|
||||
EXPORT_SYMBOL(dmu_buf_will_fill);
|
||||
EXPORT_SYMBOL(dmu_buf_fill_done);
|
||||
|
Loading…
Reference in New Issue
Block a user