mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 11:18:52 +03:00
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the same directory, on a machine with 16 CPU cores, I observed poor performance. I noticed that dmu_tx_hold_zap() was using about 30% of all CPU, and doing dnode_hold() 7 times on the same object (the ZAP object that is being held). dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the dnode_t that we already have in hand, rather than repeatedly calling dnode_hold(). To do this, we need to pass the dnode_t down through all the intermediate calls that dmu_tx_hold_zap() makes, making these routines take the dnode_t* rather than an objset_t* and a uint64_t object number. In particular, the following routines will need to have analogous *_by_dnode() variants created: dmu_buf_hold_noread() dmu_buf_hold() zap_lookup() zap_lookup_norm() zap_count_write() zap_lockdir() zap_count_write() This can improve performance on the benchmark described above by 100%, from 30,000 file creations per second to 60,000. (This improvement is on top of that provided by working around the object allocation issue. Peak performance of ~90,000 creations per second was observed with 8 CPUs; adding CPUs past that decreased performance due to lock contention.) The CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds to 40 CPU-seconds. Sponsored by: Intel Corp. Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Ned Bass <bass6@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/7004 OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109 Closes #4641 Closes #4972
This commit is contained in:
committed by
Brian Behlendorf
parent
8bea981504
commit
2bce8049c3
+16
-3
@@ -270,6 +270,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
|
||||
uint64_t blk, off;
|
||||
int err;
|
||||
dmu_buf_t *db;
|
||||
dnode_t *dn;
|
||||
int bs = FZAP_BLOCK_SHIFT(zap);
|
||||
|
||||
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
|
||||
@@ -277,8 +278,15 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
|
||||
blk = idx >> (bs-3);
|
||||
off = idx & ((1<<(bs-3))-1);
|
||||
|
||||
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
|
||||
/*
|
||||
* Note: this is equivalent to dmu_buf_hold(), but we use
|
||||
* _dnode_enter / _by_dnode because it's faster because we don't
|
||||
* have to hold the dnode.
|
||||
*/
|
||||
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
|
||||
err = dmu_buf_hold_by_dnode(dn,
|
||||
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
|
||||
dmu_buf_dnode_exit(zap->zap_dbuf);
|
||||
if (err)
|
||||
return (err);
|
||||
*valp = ((uint64_t *)db->db_data)[off];
|
||||
@@ -292,9 +300,11 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
|
||||
*/
|
||||
blk = (idx*2) >> (bs-3);
|
||||
|
||||
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
|
||||
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
|
||||
err = dmu_buf_hold_by_dnode(dn,
|
||||
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
dmu_buf_dnode_exit(zap->zap_dbuf);
|
||||
if (err == 0)
|
||||
dmu_buf_rele(db, FTAG);
|
||||
}
|
||||
@@ -502,6 +512,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
|
||||
zap_leaf_t *l;
|
||||
int bs = FZAP_BLOCK_SHIFT(zap);
|
||||
int err;
|
||||
dnode_t *dn;
|
||||
|
||||
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
|
||||
|
||||
@@ -515,8 +526,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
|
||||
if (blkid == 0)
|
||||
return (ENOENT);
|
||||
|
||||
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
|
||||
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
|
||||
err = dmu_buf_hold_by_dnode(dn,
|
||||
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
|
||||
dmu_buf_dnode_exit(zap->zap_dbuf);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user