diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 66ab84df5..94bd1fd8f 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -377,11 +377,98 @@ bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); } +/* + * Logically add subobj's contents to the parent bpobj. + * + * In the most general case, this is accomplished in constant time by adding + * a reference to subobj. This case is used when enqueuing a large subobj: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * + * +--------------+ +--------------+ + * | sub-bpobj |----------------------> | subsubobj | + * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ + * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | + * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ + * + * Result: sub-bpobj added to parent's subobj list. + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+-----+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | + * +----+----+----+----+----+ +-----+-----+-----+--|--+ + * | + * /-----------------------------------------------------/ + * v + * +--------------+ +--------------+ + * | sub-bpobj |----------------------> | subsubobj | + * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ + * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | + * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ + * + * + * In a common case, the subobj is small: its bp's and its list of subobj's + * are each stored in a single block. In this case we copy the subobj's + * contents to the parent: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * ^ ^ + * +--------------+ | +--------------+ | + * | sub-bpobj |---------^------------> | subsubobj | ^ + * +----+----+----+ | +-----+-----+--+ | + * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/ + * +----+----+ +-----+-----+ + * + * Result: subobj destroyed, contents copied to parent: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+ + * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ | + * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+ + * + * + * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's + * but retain the sub-bpobj: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * ^ + * +--------------+ +--------------+ | + * | sub-bpobj |----------------------> | subsubobj | ^ + * +----+----+----+----+---------+----+ +-----+-----+--+ | + * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/ + * +----+----+----+----+---------+----+ +-----+-----+ + * + * Result: sub-sub-bpobjs and subobj added to parent's subobj list. + * +--------------+ +--------------+ + * | bpobj |-------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* | + * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+ + * | + * /--------------------------------------------------------------/ + * v + * +--------------+ + * | sub-bpobj | + * +----+----+----+----+---------+----+ + * | bp | bp | bp | bp | ... | bp | + * +----+----+----+----+---------+----+ + */ void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) { bpobj_t subbpo; uint64_t used, comp, uncomp, subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; ASSERT(bpobj_is_open(bpo)); ASSERT(subobj != 0); @@ -406,61 +493,110 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); - if (bpo->bpo_phys->bpo_subobjs == 0) { - bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, - DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, - DMU_OT_NONE, 0, tx); + + dmu_object_info_t doi; + + if (bpo->bpo_phys->bpo_subobjs != 0) { + ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); } - ASSERTV(dmu_object_info_t doi); - ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); - - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - sizeof (subobj), &subobj, tx); - bpo->bpo_phys->bpo_num_subobjs++; - /* * If subobj has only one block of subobjs, then move subobj's - * subobjs to bpo's subobj list directly. This reduces - * recursion in bpobj_iterate due to nested subobjs. + * subobjs to bpo's subobj list directly. This reduces recursion in + * bpobj_iterate due to nested subobjs. */ subsubobjs = subbpo.bpo_phys->bpo_subobjs; if (subsubobjs != 0) { - dmu_object_info_t doi; - - VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); - if (doi.doi_max_offset == doi.doi_data_block_size) { - dmu_buf_t *subdb; - uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; - - VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, - 0, FTAG, &subdb, 0)); - /* - * Make sure that we are not asking dmu_write() - * to write more data than we have in our buffer. - */ - VERIFY3U(subdb->db_size, >=, - numsubsub * sizeof (subobj)); - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); - dmu_buf_rele(subdb, FTAG); - bpo->bpo_phys->bpo_num_subobjs += numsubsub; - - dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); - subbpo.bpo_phys->bpo_subobjs = 0; - VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, - subsubobjs, tx)); + VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset > doi.doi_data_block_size) { + copy_subsub = B_FALSE; } } + + /* + * If, in addition to having only one block of subobj's, subobj has + * only one block of bp's, then move subobj's bp's to bpo's bp list + * directly. This reduces recursion in bpobj_iterate due to nested + * subobjs. + */ + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi)); + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) { + copy_bps = B_FALSE; + } + + if (copy_subsub && subsubobjs != 0) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. + */ + VERIFY3U(subdb->db_size, >=, + numsubsub * sizeof (subobj)); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = + dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx)); + } + + if (copy_bps) { + dmu_buf_t *bps; + uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs; + + ASSERT(copy_subsub); + VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj, + 0, FTAG, &bps, 0)); + + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. + */ + VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t)); + dmu_write(bpo->bpo_os, bpo->bpo_object, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), + numbps * sizeof (blkptr_t), + bps->db_data, tx); + dmu_buf_rele(bps, FTAG); + bpo->bpo_phys->bpo_num_blkptrs += numbps; + + bpobj_close(&subbpo); + VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx)); + } else { + bpobj_close(&subbpo); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = + dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + } + bpo->bpo_phys->bpo_bytes += used; bpo->bpo_phys->bpo_comp += comp; bpo->bpo_phys->bpo_uncomp += uncomp; mutex_exit(&bpo->bpo_lock); - bpobj_close(&subbpo); } void