mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
OpenZFS 7793 - ztest fails assertion in dmu_tx_willuse_space
Reviewed by: Steve Gonczi <steve.gonczi@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Background information: This assertion about tx_space_* verifies that we are not dirtying more stuff than we thought we would. We “need” to know how much we will dirty so that we can check if we should fail this transaction with ENOSPC/EDQUOT, in dmu_tx_assign(). While the transaction is open (i.e. between dmu_tx_assign() and dmu_tx_commit() — typically less than a millisecond), we call dbuf_dirty() on the exact blocks that will be modified. Once this happens, the temporary accounting in tx_space_* is unnecessary, because we know exactly what blocks are newly dirtied; we call dnode_willuse_space() to track this more exact accounting. The fundamental problem causing this bug is that dmu_tx_hold_*() relies on the current state in the DMU (e.g. dn_nlevels) to predict how much will be dirtied by this transaction, but this state can change before we actually perform the transaction (i.e. call dbuf_dirty()). This bug will be fixed by removing the assertion that the tx_space_* accounting is perfectly accurate (i.e. we never dirty more than was predicted by dmu_tx_hold_*()). By removing the requirement that this accounting be perfectly accurate, we can also vastly simplify it, e.g. removing most of the logic in dmu_tx_count_*(). The new tx space accounting will be very approximate, and may be more or less than what is actually dirtied. It will still be used to determine if this transaction will put us over quota. Transactions that are marked by dmu_tx_mark_netfree() will be excepted from this check. We won’t make an attempt to determine how much space will be freed by the transaction — this was rarely accurate enough to determine if a transaction should be permitted when we are over quota, which is why dmu_tx_mark_netfree() was introduced in 2014. 
We also won’t attempt to give “credit” when overwriting existing blocks, if those blocks may be freed. This allows us to remove the do_free_accounting logic in dbuf_dirty(), and associated routines. This logic attempted to predict what will be on disk when this txg syncs, to know if the overwritten block will be freed (i.e. exists, and has no snapshots). OpenZFS-issue: https://www.illumos.org/issues/7793 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/3704e0a Upstream bugs: DLPX-32883a Closes #5804 Porting notes: - DNODE_SIZE replaced with DNODE_MIN_SIZE in dmu_tx_count_dnode(). Using the default dnode size would be slightly better. - DEBUG_DMU_TX wrappers and configure option removed. - Resolved _by_dnode() conflicts; these changes have not yet been applied to OpenZFS.
This commit is contained in:
+9
-75
@@ -1432,41 +1432,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
|
||||
mutex_exit(&dn->dn_dbufs_mtx);
|
||||
}
|
||||
|
||||
static int
|
||||
dbuf_block_freeable(dmu_buf_impl_t *db)
|
||||
{
|
||||
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
|
||||
uint64_t birth_txg = 0;
|
||||
|
||||
/*
|
||||
* We don't need any locking to protect db_blkptr:
|
||||
* If it's syncing, then db_last_dirty will be set
|
||||
* so we'll ignore db_blkptr.
|
||||
*
|
||||
* This logic ensures that only block births for
|
||||
* filled blocks are considered.
|
||||
*/
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
if (db->db_last_dirty && (db->db_blkptr == NULL ||
|
||||
!BP_IS_HOLE(db->db_blkptr))) {
|
||||
birth_txg = db->db_last_dirty->dr_txg;
|
||||
} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
|
||||
birth_txg = db->db_blkptr->blk_birth;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this block doesn't exist or is in a snapshot, it can't be freed.
|
||||
* Don't pass the bp to dsl_dataset_block_freeable() since we
|
||||
* are holding the db_mtx lock and might deadlock if we are
|
||||
* prefetching a dedup-ed block.
|
||||
*/
|
||||
if (birth_txg != 0)
|
||||
return (ds == NULL ||
|
||||
dsl_dataset_block_freeable(ds, NULL, birth_txg));
|
||||
else
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
|
||||
{
|
||||
@@ -1516,7 +1481,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
|
||||
}
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dnode_willuse_space(dn, size-osize, tx);
|
||||
dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
|
||||
DB_DNODE_EXIT(db);
|
||||
}
|
||||
|
||||
@@ -1566,7 +1531,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
||||
objset_t *os;
|
||||
dbuf_dirty_record_t **drp, *dr;
|
||||
int drop_struct_lock = FALSE;
|
||||
boolean_t do_free_accounting = B_FALSE;
|
||||
int txgoff = tx->tx_txg & TXG_MASK;
|
||||
|
||||
ASSERT(tx->tx_txg != 0);
|
||||
@@ -1688,15 +1652,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
||||
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
|
||||
|
||||
if (db->db_blkid != DMU_BONUS_BLKID) {
|
||||
/*
|
||||
* Update the accounting.
|
||||
* Note: we delay "free accounting" until after we drop
|
||||
* the db_mtx. This keeps us from grabbing other locks
|
||||
* (and possibly deadlocking) in bp_get_dsize() while
|
||||
* also holding the db_mtx.
|
||||
*/
|
||||
dnode_willuse_space(dn, db->db.db_size, tx);
|
||||
do_free_accounting = dbuf_block_freeable(db);
|
||||
dmu_objset_willuse_space(os, db->db.db_size, tx);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1790,21 +1746,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
||||
drop_struct_lock = TRUE;
|
||||
}
|
||||
|
||||
if (do_free_accounting) {
|
||||
blkptr_t *bp = db->db_blkptr;
|
||||
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
|
||||
bp_get_dsize(os->os_spa, bp) : db->db.db_size;
|
||||
/*
|
||||
* This is only a guess -- if the dbuf is dirty
|
||||
* in a previous txg, we don't know how much
|
||||
* space it will use on disk yet. We should
|
||||
* really have the struct_rwlock to access
|
||||
* db_blkptr, but since this is just a guess,
|
||||
* it's OK if we get an odd answer.
|
||||
*/
|
||||
ddt_prefetch(os->os_spa, bp);
|
||||
dnode_willuse_space(dn, -willfree, tx);
|
||||
}
|
||||
/*
|
||||
* If we are overwriting a dedup BP, then unless it is snapshotted,
|
||||
* when we get to syncing context we will need to decrement its
|
||||
* refcount in the DDT. Prefetch the relevant DDT block so that
|
||||
* syncing context won't have to wait for the i/o.
|
||||
*/
|
||||
ddt_prefetch(os->os_spa, db->db_blkptr);
|
||||
|
||||
if (db->db_level == 0) {
|
||||
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
|
||||
@@ -3092,19 +3040,6 @@ dmu_buf_user_evict_wait()
|
||||
taskq_wait(dbu_evict_taskq);
|
||||
}
|
||||
|
||||
boolean_t
|
||||
dmu_buf_freeable(dmu_buf_t *dbuf)
|
||||
{
|
||||
boolean_t res = B_FALSE;
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
|
||||
|
||||
if (db->db_blkptr)
|
||||
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
|
||||
db->db_blkptr, db->db_blkptr->blk_birth);
|
||||
|
||||
return (res);
|
||||
}
|
||||
|
||||
blkptr_t *
|
||||
dmu_buf_get_blkptr(dmu_buf_t *db)
|
||||
{
|
||||
@@ -3891,7 +3826,6 @@ EXPORT_SYMBOL(dbuf_sync_list);
|
||||
EXPORT_SYMBOL(dmu_buf_set_user);
|
||||
EXPORT_SYMBOL(dmu_buf_set_user_ie);
|
||||
EXPORT_SYMBOL(dmu_buf_get_user);
|
||||
EXPORT_SYMBOL(dmu_buf_freeable);
|
||||
EXPORT_SYMBOL(dmu_buf_get_blkptr);
|
||||
|
||||
/* BEGIN CSTYLED */
|
||||
|
||||
@@ -2344,6 +2344,23 @@ dmu_fsname(const char *snapname, char *buf)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call when we think we're going to write/free space in open context to track
|
||||
* the amount of dirty data in the open txg, which is also the amount
|
||||
* of memory that can not be evicted until this txg syncs.
|
||||
*/
|
||||
void
|
||||
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
|
||||
{
|
||||
dsl_dataset_t *ds = os->os_dsl_dataset;
|
||||
int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
|
||||
|
||||
if (ds != NULL) {
|
||||
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
|
||||
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(_KERNEL) && defined(HAVE_SPL)
|
||||
EXPORT_SYMBOL(dmu_objset_zil);
|
||||
EXPORT_SYMBOL(dmu_objset_pool);
|
||||
|
||||
+189
-682
File diff suppressed because it is too large
Load Diff
@@ -1948,25 +1948,6 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call when we think we're going to write/free space in open context to track
|
||||
* the amount of memory in use by the currently open txg.
|
||||
*/
|
||||
void
|
||||
dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
|
||||
{
|
||||
objset_t *os = dn->dn_objset;
|
||||
dsl_dataset_t *ds = os->os_dsl_dataset;
|
||||
int64_t aspace = spa_get_asize(os->os_spa, space);
|
||||
|
||||
if (ds != NULL) {
|
||||
dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
|
||||
dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
|
||||
}
|
||||
|
||||
dmu_tx_willuse_space(tx, aspace);
|
||||
}
|
||||
|
||||
/*
|
||||
* Scans a block at the indicated "level" looking for a hole or data,
|
||||
* depending on 'flags'.
|
||||
|
||||
@@ -242,42 +242,6 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
|
||||
return (used);
|
||||
}
|
||||
|
||||
uint64_t
|
||||
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
|
||||
{
|
||||
uint64_t trysnap = 0;
|
||||
|
||||
if (ds == NULL)
|
||||
return (0);
|
||||
/*
|
||||
* The snapshot creation could fail, but that would cause an
|
||||
* incorrect FALSE return, which would only result in an
|
||||
* overestimation of the amount of space that an operation would
|
||||
* consume, which is OK.
|
||||
*
|
||||
* There's also a small window where we could miss a pending
|
||||
* snapshot, because we could set the sync task in the quiescing
|
||||
* phase. So this should only be used as a guess.
|
||||
*/
|
||||
if (ds->ds_trysnap_txg >
|
||||
spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
|
||||
trysnap = ds->ds_trysnap_txg;
|
||||
return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
|
||||
}
|
||||
|
||||
boolean_t
|
||||
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
|
||||
uint64_t blk_birth)
|
||||
{
|
||||
if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
|
||||
(bp != NULL && BP_IS_HOLE(bp)))
|
||||
return (B_FALSE);
|
||||
|
||||
ddt_prefetch(dsl_dataset_get_spa(ds), bp);
|
||||
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
* We have to release the fsid synchronously or we risk that a subsequent
|
||||
* mount of the same dataset will fail to unique_insert the fsid. This
|
||||
@@ -3731,8 +3695,6 @@ EXPORT_SYMBOL(dsl_dataset_space_wouldfree);
|
||||
EXPORT_SYMBOL(dsl_dataset_sync);
|
||||
EXPORT_SYMBOL(dsl_dataset_block_born);
|
||||
EXPORT_SYMBOL(dsl_dataset_block_kill);
|
||||
EXPORT_SYMBOL(dsl_dataset_block_freeable);
|
||||
EXPORT_SYMBOL(dsl_dataset_prev_snap_txg);
|
||||
EXPORT_SYMBOL(dsl_dataset_dirty);
|
||||
EXPORT_SYMBOL(dsl_dataset_stats);
|
||||
EXPORT_SYMBOL(dsl_dataset_fast_stat);
|
||||
|
||||
+18
-22
@@ -1031,13 +1031,12 @@ static uint64_t
|
||||
dsl_dir_space_towrite(dsl_dir_t *dd)
|
||||
{
|
||||
uint64_t space = 0;
|
||||
int i;
|
||||
|
||||
ASSERT(MUTEX_HELD(&dd->dd_lock));
|
||||
|
||||
for (i = 0; i < TXG_SIZE; i++) {
|
||||
space += dd->dd_space_towrite[i&TXG_MASK];
|
||||
ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
|
||||
for (int i = 0; i < TXG_SIZE; i++) {
|
||||
space += dd->dd_space_towrite[i & TXG_MASK];
|
||||
ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
|
||||
}
|
||||
return (space);
|
||||
}
|
||||
@@ -1117,16 +1116,13 @@ struct tempreserve {
|
||||
|
||||
static int
|
||||
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
|
||||
boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
|
||||
boolean_t ignorequota, list_t *tr_list,
|
||||
dmu_tx_t *tx, boolean_t first)
|
||||
{
|
||||
uint64_t txg = tx->tx_txg;
|
||||
uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
|
||||
uint64_t deferred = 0;
|
||||
uint64_t quota;
|
||||
struct tempreserve *tr;
|
||||
int retval = EDQUOT;
|
||||
int txgidx = txg & TXG_MASK;
|
||||
int i;
|
||||
uint64_t ref_rsrv = 0;
|
||||
|
||||
ASSERT3U(txg, !=, 0);
|
||||
@@ -1138,10 +1134,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
|
||||
* Check against the dsl_dir's quota. We don't add in the delta
|
||||
* when checking for over-quota because they get one free hit.
|
||||
*/
|
||||
est_inflight = dsl_dir_space_towrite(dd);
|
||||
for (i = 0; i < TXG_SIZE; i++)
|
||||
uint64_t est_inflight = dsl_dir_space_towrite(dd);
|
||||
for (int i = 0; i < TXG_SIZE; i++)
|
||||
est_inflight += dd->dd_tempreserved[i];
|
||||
used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
|
||||
uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
|
||||
|
||||
/*
|
||||
* On the first iteration, fetch the dataset's used-on-disk and
|
||||
@@ -1152,9 +1148,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
|
||||
int error;
|
||||
dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
|
||||
|
||||
error = dsl_dataset_check_quota(ds, checkrefquota,
|
||||
error = dsl_dataset_check_quota(ds, !netfree,
|
||||
asize, est_inflight, &used_on_disk, &ref_rsrv);
|
||||
if (error) {
|
||||
if (error != 0) {
|
||||
mutex_exit(&dd->dd_lock);
|
||||
DMU_TX_STAT_BUMP(dmu_tx_quota);
|
||||
return (error);
|
||||
@@ -1180,6 +1176,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
|
||||
* we're very close to full, this will allow a steady trickle of
|
||||
* removes to get through.
|
||||
*/
|
||||
uint64_t deferred = 0;
|
||||
if (dd->dd_parent == NULL) {
|
||||
spa_t *spa = dd->dd_pool->dp_spa;
|
||||
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
|
||||
@@ -1210,9 +1207,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
|
||||
}
|
||||
|
||||
/* We need to up our estimated delta before dropping dd_lock */
|
||||
dd->dd_tempreserved[txgidx] += asize;
|
||||
dd->dd_tempreserved[txg & TXG_MASK] += asize;
|
||||
|
||||
parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
|
||||
uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
|
||||
asize - ref_rsrv);
|
||||
mutex_exit(&dd->dd_lock);
|
||||
|
||||
@@ -1222,11 +1219,11 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
|
||||
list_insert_tail(tr_list, tr);
|
||||
|
||||
/* see if it's OK with our parent */
|
||||
if (dd->dd_parent && parent_rsrv) {
|
||||
if (dd->dd_parent != NULL && parent_rsrv != 0) {
|
||||
boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
|
||||
|
||||
return (dsl_dir_tempreserve_impl(dd->dd_parent,
|
||||
parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
|
||||
parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE));
|
||||
} else {
|
||||
return (0);
|
||||
}
|
||||
@@ -1240,7 +1237,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
|
||||
*/
|
||||
int
|
||||
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
|
||||
uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
|
||||
boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
|
||||
{
|
||||
int err;
|
||||
list_t *tr_list;
|
||||
@@ -1254,7 +1251,6 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
|
||||
list_create(tr_list, sizeof (struct tempreserve),
|
||||
offsetof(struct tempreserve, tr_node));
|
||||
ASSERT3S(asize, >, 0);
|
||||
ASSERT3S(fsize, >=, 0);
|
||||
|
||||
err = arc_tempreserve_space(lsize, tx->tx_txg);
|
||||
if (err == 0) {
|
||||
@@ -1281,8 +1277,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
|
||||
}
|
||||
|
||||
if (err == 0) {
|
||||
err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
|
||||
FALSE, asize > usize, tr_list, tx, TRUE);
|
||||
err = dsl_dir_tempreserve_impl(dd, asize, netfree,
|
||||
B_FALSE, tr_list, tx, B_TRUE);
|
||||
}
|
||||
|
||||
if (err != 0)
|
||||
|
||||
@@ -1615,7 +1615,7 @@ spa_freeze_txg(spa_t *spa)
|
||||
|
||||
/* ARGSUSED */
|
||||
uint64_t
|
||||
spa_get_asize(spa_t *spa, uint64_t lsize)
|
||||
spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
|
||||
{
|
||||
return (lsize * spa_asize_inflation);
|
||||
}
|
||||
@@ -2078,7 +2078,6 @@ EXPORT_SYMBOL(spa_version);
|
||||
EXPORT_SYMBOL(spa_state);
|
||||
EXPORT_SYMBOL(spa_load_state);
|
||||
EXPORT_SYMBOL(spa_freeze_txg);
|
||||
EXPORT_SYMBOL(spa_get_asize);
|
||||
EXPORT_SYMBOL(spa_get_dspace);
|
||||
EXPORT_SYMBOL(spa_update_dspace);
|
||||
EXPORT_SYMBOL(spa_deflate);
|
||||
|
||||
@@ -1357,64 +1357,3 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
|
||||
refcount_t *tooverwrite)
|
||||
{
|
||||
zap_t *zap = zn->zn_zap;
|
||||
zap_leaf_t *l;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* Account for the header block of the fatzap.
|
||||
*/
|
||||
if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
|
||||
(void) refcount_add_many(tooverwrite,
|
||||
zap->zap_dbuf->db_size, FTAG);
|
||||
} else {
|
||||
(void) refcount_add_many(towrite,
|
||||
zap->zap_dbuf->db_size, FTAG);
|
||||
}
|
||||
|
||||
/*
|
||||
* Account for the pointer table blocks.
|
||||
* If we are adding we need to account for the following cases :
|
||||
* - If the pointer table is embedded, this operation could force an
|
||||
* external pointer table.
|
||||
* - If this already has an external pointer table this operation
|
||||
* could extend the table.
|
||||
*/
|
||||
if (add) {
|
||||
if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
|
||||
(void) refcount_add_many(towrite,
|
||||
zap->zap_dbuf->db_size, FTAG);
|
||||
} else {
|
||||
(void) refcount_add_many(towrite,
|
||||
zap->zap_dbuf->db_size * 3, FTAG);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Now, check if the block containing leaf is freeable
|
||||
* and account accordingly.
|
||||
*/
|
||||
err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
|
||||
if (err != 0) {
|
||||
return (err);
|
||||
}
|
||||
|
||||
if (!add && dmu_buf_freeable(l->l_dbuf)) {
|
||||
(void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG);
|
||||
} else {
|
||||
/*
|
||||
* If this is an add operation, the leaf block could split.
|
||||
* Hence, we need to account for an additional leaf block.
|
||||
*/
|
||||
(void) refcount_add_many(towrite,
|
||||
(add ? 2 : 1) * l->l_dbuf->db_size, FTAG);
|
||||
}
|
||||
|
||||
zap_put_leaf(l);
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -1594,88 +1594,6 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
|
||||
refcount_t *towrite, refcount_t *tooverwrite)
|
||||
{
|
||||
zap_t *zap;
|
||||
int err = 0;
|
||||
|
||||
/*
|
||||
* Since, we don't have a name, we cannot figure out which blocks will
|
||||
* be affected in this operation. So, account for the worst case :
|
||||
* - 3 blocks overwritten: target leaf, ptrtbl block, header block
|
||||
* - 4 new blocks written if adding:
|
||||
* - 2 blocks for possibly split leaves,
|
||||
* - 2 grown ptrtbl blocks
|
||||
*
|
||||
* This also accommodates the case where an add operation to a fairly
|
||||
* large microzap results in a promotion to fatzap.
|
||||
*/
|
||||
if (name == NULL) {
|
||||
(void) refcount_add_many(towrite,
|
||||
(3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
|
||||
return (err);
|
||||
}
|
||||
|
||||
/*
|
||||
* We lock the zap with adding == FALSE. Because, if we pass
|
||||
* the actual value of add, it could trigger a mzap_upgrade().
|
||||
* At present we are just evaluating the possibility of this operation
|
||||
* and hence we do not want to trigger an upgrade.
|
||||
*/
|
||||
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
|
||||
FTAG, &zap);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
|
||||
if (!zap->zap_ismicro) {
|
||||
zap_name_t *zn = zap_name_alloc(zap, name, 0);
|
||||
if (zn) {
|
||||
err = fzap_count_write(zn, add, towrite,
|
||||
tooverwrite);
|
||||
zap_name_free(zn);
|
||||
} else {
|
||||
/*
|
||||
* We treat this case as similar to (name == NULL)
|
||||
*/
|
||||
(void) refcount_add_many(towrite,
|
||||
(3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* We are here if (name != NULL) and this is a micro-zap.
|
||||
* We account for the header block depending on whether it
|
||||
* is freeable.
|
||||
*
|
||||
* In case of an add operation it is hard to find out
|
||||
* if this add will promote this microzap to fatzap.
|
||||
* Hence, we consider the worst case and account for the
|
||||
* blocks assuming this microzap would be promoted to a
|
||||
* fatzap.
|
||||
*
|
||||
* 1 block overwritten : header block
|
||||
* 4 new blocks written : 2 new split leaf, 2 grown
|
||||
* ptrtbl blocks
|
||||
*/
|
||||
if (dmu_buf_freeable(zap->zap_dbuf)) {
|
||||
(void) refcount_add_many(tooverwrite,
|
||||
MZAP_MAX_BLKSZ, FTAG);
|
||||
} else {
|
||||
(void) refcount_add_many(towrite,
|
||||
MZAP_MAX_BLKSZ, FTAG);
|
||||
}
|
||||
|
||||
if (add) {
|
||||
(void) refcount_add_many(towrite,
|
||||
4 * MZAP_MAX_BLKSZ, FTAG);
|
||||
}
|
||||
}
|
||||
|
||||
zap_unlockdir(zap, FTAG);
|
||||
return (err);
|
||||
}
|
||||
|
||||
#if defined(_KERNEL) && defined(HAVE_SPL)
|
||||
EXPORT_SYMBOL(zap_create);
|
||||
EXPORT_SYMBOL(zap_create_dnsize);
|
||||
@@ -1694,7 +1612,6 @@ EXPORT_SYMBOL(zap_lookup_uint64);
|
||||
EXPORT_SYMBOL(zap_contains);
|
||||
EXPORT_SYMBOL(zap_prefetch);
|
||||
EXPORT_SYMBOL(zap_prefetch_uint64);
|
||||
EXPORT_SYMBOL(zap_count_write_by_dnode);
|
||||
EXPORT_SYMBOL(zap_add);
|
||||
EXPORT_SYMBOL(zap_add_by_dnode);
|
||||
EXPORT_SYMBOL(zap_add_uint64);
|
||||
|
||||
Reference in New Issue
Block a user