mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 02:14:28 +03:00
bpobj_enqueue_subobj() should copy small subobj's
When we delete a snapshot, we consolidate some bpobj's together because we no longer need to keep their entries in separate buckets. This is done in constant time by including the "sub" bpobj by reference in the parent bpobj. After many snapshots have been deleted, we may have many sub-bpobj's. Usually, most sub-bpobj's don't contain many BP's. Compared to this small payload, the sub-bpobj is relatively heavyweight since it is a object in the MOS. A common scenario on a long-lived pool is for the vast majority of MOS objects to be small sub-bpobj's. To improve this situation, when consolidating bpobj's together, bpobj_enqueue_subobj() can copy the contents of small bpobj's into the parent, and then delete the enqueued bpobj, rather than including it by reference. Since this copying is limited in size (to one block), the consolidation is still constant time, though with a larger constant due to reading in the one block of the enqueued bpobj. This idea and mechanism are similar to how we handle "sub-subobj's". When including a sub-bpobj by reference, if the sub-bpobj itself has less than a block of sub-sub-bpobj's, the list of sub-sub-bpobj's is copied to the parent bpobj's list of sub-bpobj's. Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Paul Zuchowski <pzuchowski@datto.com> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Closes #8053 Issue #7908
This commit is contained in:
parent
82c0a050fc
commit
9553c533a6
@ -377,11 +377,98 @@ bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
|
||||
return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
|
||||
}
|
||||
|
||||
/*
|
||||
* Logically add subobj's contents to the parent bpobj.
|
||||
*
|
||||
* In the most general case, this is accomplished in constant time by adding
|
||||
* a reference to subobj. This case is used when enqueuing a large subobj:
|
||||
* +--------------+ +--------------+
|
||||
* | bpobj |----------------------->| subobj list |
|
||||
* +----+----+----+----+----+ +-----+-----+--+--+
|
||||
* | bp | bp | bp | bp | bp | | obj | obj | obj |
|
||||
* +----+----+----+----+----+ +-----+-----+-----+
|
||||
*
|
||||
* +--------------+ +--------------+
|
||||
* | sub-bpobj |----------------------> | subsubobj |
|
||||
* +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
|
||||
* | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
|
||||
* +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
|
||||
*
|
||||
* Result: sub-bpobj added to parent's subobj list.
|
||||
* +--------------+ +--------------+
|
||||
* | bpobj |----------------------->| subobj list |
|
||||
* +----+----+----+----+----+ +-----+-----+--+--+-----+
|
||||
* | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ |
|
||||
* +----+----+----+----+----+ +-----+-----+-----+--|--+
|
||||
* |
|
||||
* /-----------------------------------------------------/
|
||||
* v
|
||||
* +--------------+ +--------------+
|
||||
* | sub-bpobj |----------------------> | subsubobj |
|
||||
* +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
|
||||
* | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
|
||||
* +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
|
||||
*
|
||||
*
|
||||
* In a common case, the subobj is small: its bp's and its list of subobj's
|
||||
* are each stored in a single block. In this case we copy the subobj's
|
||||
* contents to the parent:
|
||||
* +--------------+ +--------------+
|
||||
* | bpobj |----------------------->| subobj list |
|
||||
* +----+----+----+----+----+ +-----+-----+--+--+
|
||||
* | bp | bp | bp | bp | bp | | obj | obj | obj |
|
||||
* +----+----+----+----+----+ +-----+-----+-----+
|
||||
* ^ ^
|
||||
* +--------------+ | +--------------+ |
|
||||
* | sub-bpobj |---------^------------> | subsubobj | ^
|
||||
* +----+----+----+ | +-----+-----+--+ |
|
||||
* | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/
|
||||
* +----+----+ +-----+-----+
|
||||
*
|
||||
* Result: subobj destroyed, contents copied to parent:
|
||||
* +--------------+ +--------------+
|
||||
* | bpobj |----------------------->| subobj list |
|
||||
* +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+
|
||||
* | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ |
|
||||
* +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+
|
||||
*
|
||||
*
|
||||
* If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
|
||||
* but retain the sub-bpobj:
|
||||
* +--------------+ +--------------+
|
||||
* | bpobj |----------------------->| subobj list |
|
||||
* +----+----+----+----+----+ +-----+-----+--+--+
|
||||
* | bp | bp | bp | bp | bp | | obj | obj | obj |
|
||||
* +----+----+----+----+----+ +-----+-----+-----+
|
||||
* ^
|
||||
* +--------------+ +--------------+ |
|
||||
* | sub-bpobj |----------------------> | subsubobj | ^
|
||||
* +----+----+----+----+---------+----+ +-----+-----+--+ |
|
||||
* | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/
|
||||
* +----+----+----+----+---------+----+ +-----+-----+
|
||||
*
|
||||
* Result: sub-sub-bpobjs and subobj added to parent's subobj list.
|
||||
* +--------------+ +--------------+
|
||||
* | bpobj |-------------------->| subobj list |
|
||||
* +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+
|
||||
* | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* |
|
||||
* +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+
|
||||
* |
|
||||
* /--------------------------------------------------------------/
|
||||
* v
|
||||
* +--------------+
|
||||
* | sub-bpobj |
|
||||
* +----+----+----+----+---------+----+
|
||||
* | bp | bp | bp | bp | ... | bp |
|
||||
* +----+----+----+----+---------+----+
|
||||
*/
|
||||
void
|
||||
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
|
||||
{
|
||||
bpobj_t subbpo;
|
||||
uint64_t used, comp, uncomp, subsubobjs;
|
||||
boolean_t copy_subsub = B_TRUE;
|
||||
boolean_t copy_bps = B_TRUE;
|
||||
|
||||
ASSERT(bpobj_is_open(bpo));
|
||||
ASSERT(subobj != 0);
|
||||
@ -406,61 +493,110 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
|
||||
|
||||
mutex_enter(&bpo->bpo_lock);
|
||||
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
|
||||
if (bpo->bpo_phys->bpo_subobjs == 0) {
|
||||
bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
|
||||
DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
|
||||
DMU_OT_NONE, 0, tx);
|
||||
|
||||
dmu_object_info_t doi;
|
||||
|
||||
if (bpo->bpo_phys->bpo_subobjs != 0) {
|
||||
ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
|
||||
&doi));
|
||||
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
|
||||
}
|
||||
|
||||
ASSERTV(dmu_object_info_t doi);
|
||||
ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
|
||||
ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
|
||||
|
||||
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
|
||||
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
|
||||
sizeof (subobj), &subobj, tx);
|
||||
bpo->bpo_phys->bpo_num_subobjs++;
|
||||
|
||||
/*
|
||||
* If subobj has only one block of subobjs, then move subobj's
|
||||
* subobjs to bpo's subobj list directly. This reduces
|
||||
* recursion in bpobj_iterate due to nested subobjs.
|
||||
* subobjs to bpo's subobj list directly. This reduces recursion in
|
||||
* bpobj_iterate due to nested subobjs.
|
||||
*/
|
||||
subsubobjs = subbpo.bpo_phys->bpo_subobjs;
|
||||
if (subsubobjs != 0) {
|
||||
dmu_object_info_t doi;
|
||||
|
||||
VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
|
||||
if (doi.doi_max_offset == doi.doi_data_block_size) {
|
||||
dmu_buf_t *subdb;
|
||||
uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
|
||||
|
||||
VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
|
||||
0, FTAG, &subdb, 0));
|
||||
/*
|
||||
* Make sure that we are not asking dmu_write()
|
||||
* to write more data than we have in our buffer.
|
||||
*/
|
||||
VERIFY3U(subdb->db_size, >=,
|
||||
numsubsub * sizeof (subobj));
|
||||
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
|
||||
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
|
||||
numsubsub * sizeof (subobj), subdb->db_data, tx);
|
||||
dmu_buf_rele(subdb, FTAG);
|
||||
bpo->bpo_phys->bpo_num_subobjs += numsubsub;
|
||||
|
||||
dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
|
||||
subbpo.bpo_phys->bpo_subobjs = 0;
|
||||
VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
|
||||
subsubobjs, tx));
|
||||
VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
|
||||
if (doi.doi_max_offset > doi.doi_data_block_size) {
|
||||
copy_subsub = B_FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If, in addition to having only one block of subobj's, subobj has
|
||||
* only one block of bp's, then move subobj's bp's to bpo's bp list
|
||||
* directly. This reduces recursion in bpobj_iterate due to nested
|
||||
* subobjs.
|
||||
*/
|
||||
VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
|
||||
if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
|
||||
copy_bps = B_FALSE;
|
||||
}
|
||||
|
||||
if (copy_subsub && subsubobjs != 0) {
|
||||
dmu_buf_t *subdb;
|
||||
uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
|
||||
|
||||
VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
|
||||
0, FTAG, &subdb, 0));
|
||||
/*
|
||||
* Make sure that we are not asking dmu_write()
|
||||
* to write more data than we have in our buffer.
|
||||
*/
|
||||
VERIFY3U(subdb->db_size, >=,
|
||||
numsubsub * sizeof (subobj));
|
||||
if (bpo->bpo_phys->bpo_subobjs == 0) {
|
||||
bpo->bpo_phys->bpo_subobjs =
|
||||
dmu_object_alloc(bpo->bpo_os,
|
||||
DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
|
||||
DMU_OT_NONE, 0, tx);
|
||||
}
|
||||
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
|
||||
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
|
||||
numsubsub * sizeof (subobj), subdb->db_data, tx);
|
||||
dmu_buf_rele(subdb, FTAG);
|
||||
bpo->bpo_phys->bpo_num_subobjs += numsubsub;
|
||||
|
||||
dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
|
||||
subbpo.bpo_phys->bpo_subobjs = 0;
|
||||
VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
|
||||
}
|
||||
|
||||
if (copy_bps) {
|
||||
dmu_buf_t *bps;
|
||||
uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;
|
||||
|
||||
ASSERT(copy_subsub);
|
||||
VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
|
||||
0, FTAG, &bps, 0));
|
||||
|
||||
/*
|
||||
* Make sure that we are not asking dmu_write()
|
||||
* to write more data than we have in our buffer.
|
||||
*/
|
||||
VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
|
||||
dmu_write(bpo->bpo_os, bpo->bpo_object,
|
||||
bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
|
||||
numbps * sizeof (blkptr_t),
|
||||
bps->db_data, tx);
|
||||
dmu_buf_rele(bps, FTAG);
|
||||
bpo->bpo_phys->bpo_num_blkptrs += numbps;
|
||||
|
||||
bpobj_close(&subbpo);
|
||||
VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
|
||||
} else {
|
||||
bpobj_close(&subbpo);
|
||||
if (bpo->bpo_phys->bpo_subobjs == 0) {
|
||||
bpo->bpo_phys->bpo_subobjs =
|
||||
dmu_object_alloc(bpo->bpo_os,
|
||||
DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
|
||||
DMU_OT_NONE, 0, tx);
|
||||
}
|
||||
|
||||
dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
|
||||
bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
|
||||
sizeof (subobj), &subobj, tx);
|
||||
bpo->bpo_phys->bpo_num_subobjs++;
|
||||
}
|
||||
|
||||
bpo->bpo_phys->bpo_bytes += used;
|
||||
bpo->bpo_phys->bpo_comp += comp;
|
||||
bpo->bpo_phys->bpo_uncomp += uncomp;
|
||||
mutex_exit(&bpo->bpo_lock);
|
||||
|
||||
bpobj_close(&subbpo);
|
||||
}
|
||||
|
||||
void
|
||||
|
Loading…
Reference in New Issue
Block a user