mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 02:14:28 +03:00
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the same directory, on a machine with 16 CPU cores, I observed poor performance. I noticed that dmu_tx_hold_zap() was using about 30% of all CPU, and doing dnode_hold() 7 times on the same object (the ZAP object that is being held). dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the dnode_t that we already have in hand, rather than repeatedly calling dnode_hold(). To do this, we need to pass the dnode_t down through all the intermediate calls that dmu_tx_hold_zap() makes, making these routines take the dnode_t* rather than an objset_t* and a uint64_t object number. In particular, the following routines will need to have analogous *_by_dnode() variants created: dmu_buf_hold_noread() dmu_buf_hold() zap_lookup() zap_lookup_norm() zap_count_write() zap_lockdir() zap_count_write() This can improve performance on the benchmark described above by 100%, from 30,000 file creations per second to 60,000. (This improvement is on top of that provided by working around the object allocation issue. Peak performance of ~90,000 creations per second was observed with 8 CPUs; adding CPUs past that decreased performance due to lock contention.) The CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds to 40 CPU-seconds. Sponsored by: Intel Corp. Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Ned Bass <bass6@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> OpenZFS-issue: https://www.illumos.org/issues/7004 OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109 Closes #4641 Closes #4972
This commit is contained in:
parent
8bea981504
commit
2bce8049c3
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||
* Copyright 2014 HybridCluster. All rights reserved.
|
||||
@ -73,6 +73,7 @@ struct sa_handle;
|
||||
typedef struct objset objset_t;
|
||||
typedef struct dmu_tx dmu_tx_t;
|
||||
typedef struct dsl_dir dsl_dir_t;
|
||||
typedef struct dnode dnode_t;
|
||||
|
||||
typedef enum dmu_object_byteswap {
|
||||
DMU_BSWAP_UINT8,
|
||||
@ -420,7 +421,7 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
|
||||
#define WP_DMU_SYNC 0x2
|
||||
#define WP_SPILL 0x4
|
||||
|
||||
void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
|
||||
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
|
||||
struct zio_prop *zp);
|
||||
/*
|
||||
* The bonus data is accessed more or less like a regular buffer.
|
||||
@ -446,7 +447,7 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
|
||||
*/
|
||||
|
||||
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
|
||||
int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
|
||||
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
|
||||
void *tag, dmu_buf_t **dbp);
|
||||
int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
|
||||
|
||||
@ -466,6 +467,8 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
|
||||
*/
|
||||
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
|
||||
void *tag, dmu_buf_t **, int flags);
|
||||
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
|
||||
void *tag, dmu_buf_t **dbp, int flags);
|
||||
|
||||
/*
|
||||
* Add a reference to a dmu buffer that has already been held via
|
||||
@ -620,6 +623,8 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
|
||||
void *dmu_buf_get_user(dmu_buf_t *db);
|
||||
|
||||
objset_t *dmu_buf_get_objset(dmu_buf_t *db);
|
||||
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
|
||||
void dmu_buf_dnode_exit(dmu_buf_t *db);
|
||||
|
||||
/* Block until any in-progress dmu buf user evictions complete. */
|
||||
void dmu_buf_user_evict_wait(void);
|
||||
@ -799,7 +804,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
|
||||
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
|
||||
void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
|
||||
/* Like dmu_object_info, but faster if you have a held dnode in hand. */
|
||||
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
|
||||
void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
|
||||
/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
|
||||
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
|
||||
/*
|
||||
|
@ -20,7 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
*/
|
||||
|
||||
@ -187,7 +187,7 @@ typedef struct dnode_phys {
|
||||
#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
|
||||
(((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
|
||||
|
||||
typedef struct dnode {
|
||||
struct dnode {
|
||||
/*
|
||||
* Protects the structure of the dnode, including the number of levels
|
||||
* of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
|
||||
@ -286,7 +286,7 @@ typedef struct dnode {
|
||||
|
||||
/* holds prefetch structure */
|
||||
struct zfetch dn_zfetch;
|
||||
} dnode_t;
|
||||
};
|
||||
|
||||
/*
|
||||
* Adds a level of indirection between the dbuf and the dnode to avoid
|
||||
|
@ -236,7 +236,14 @@ int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
|
||||
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
||||
int key_numints);
|
||||
|
||||
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
|
||||
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
|
||||
uint64_t integer_size, uint64_t num_integers, void *buf);
|
||||
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
|
||||
uint64_t integer_size, uint64_t num_integers, void *buf,
|
||||
matchtype_t mt, char *realname, int rn_len,
|
||||
boolean_t *ncp);
|
||||
|
||||
int zap_count_write_by_dnode(dnode_t *dn, const char *name,
|
||||
int add, uint64_t *towrite, uint64_t *tooverwrite);
|
||||
|
||||
/*
|
||||
|
@ -21,7 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
||||
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
||||
*/
|
||||
@ -2843,6 +2843,21 @@ dmu_buf_get_objset(dmu_buf_t *db)
|
||||
return (dbi->db_objset);
|
||||
}
|
||||
|
||||
dnode_t *
|
||||
dmu_buf_dnode_enter(dmu_buf_t *db)
|
||||
{
|
||||
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
|
||||
DB_DNODE_ENTER(dbi);
|
||||
return (DB_DNODE(dbi));
|
||||
}
|
||||
|
||||
void
|
||||
dmu_buf_dnode_exit(dmu_buf_t *db)
|
||||
{
|
||||
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
|
||||
DB_DNODE_EXIT(dbi);
|
||||
}
|
||||
|
||||
static void
|
||||
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
|
||||
{
|
||||
|
@ -127,6 +127,26 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
|
||||
{ zfs_acl_byteswap, "acl" }
|
||||
};
|
||||
|
||||
int
|
||||
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
|
||||
void *tag, dmu_buf_t **dbp)
|
||||
{
|
||||
uint64_t blkid;
|
||||
dmu_buf_impl_t *db;
|
||||
|
||||
blkid = dbuf_whichblock(dn, 0, offset);
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
||||
db = dbuf_hold(dn, blkid, tag);
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
|
||||
if (db == NULL) {
|
||||
*dbp = NULL;
|
||||
return (SET_ERROR(EIO));
|
||||
}
|
||||
|
||||
*dbp = &db->db;
|
||||
return (0);
|
||||
}
|
||||
int
|
||||
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
|
||||
void *tag, dmu_buf_t **dbp)
|
||||
@ -154,6 +174,29 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
|
||||
void *tag, dmu_buf_t **dbp, int flags)
|
||||
{
|
||||
int err;
|
||||
int db_flags = DB_RF_CANFAIL;
|
||||
|
||||
if (flags & DMU_READ_NO_PREFETCH)
|
||||
db_flags |= DB_RF_NOPREFETCH;
|
||||
|
||||
err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
|
||||
if (err == 0) {
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
|
||||
err = dbuf_read(db, NULL, db_flags);
|
||||
if (err != 0) {
|
||||
dbuf_rele(db, tag);
|
||||
*dbp = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
|
||||
void *tag, dmu_buf_t **dbp, int flags)
|
||||
|
@ -21,7 +21,7 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <sys/dmu.h>
|
||||
@ -794,15 +794,14 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
|
||||
* access the name in this fat-zap so that we'll check
|
||||
* for i/o errors to the leaf blocks, etc.
|
||||
*/
|
||||
err = zap_lookup(dn->dn_objset, dn->dn_object, name,
|
||||
8, 0, NULL);
|
||||
err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
|
||||
if (err == EIO) {
|
||||
tx->tx_err = err;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
|
||||
err = zap_count_write_by_dnode(dn, name, add,
|
||||
&txh->txh_space_towrite, &txh->txh_space_tooverwrite);
|
||||
|
||||
/*
|
||||
|
@ -270,6 +270,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
|
||||
uint64_t blk, off;
|
||||
int err;
|
||||
dmu_buf_t *db;
|
||||
dnode_t *dn;
|
||||
int bs = FZAP_BLOCK_SHIFT(zap);
|
||||
|
||||
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
|
||||
@ -277,8 +278,15 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
|
||||
blk = idx >> (bs-3);
|
||||
off = idx & ((1<<(bs-3))-1);
|
||||
|
||||
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
|
||||
/*
|
||||
* Note: this is equivalent to dmu_buf_hold(), but we use
|
||||
* _dnode_enter / _by_dnode because it's faster because we don't
|
||||
* have to hold the dnode.
|
||||
*/
|
||||
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
|
||||
err = dmu_buf_hold_by_dnode(dn,
|
||||
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
|
||||
dmu_buf_dnode_exit(zap->zap_dbuf);
|
||||
if (err)
|
||||
return (err);
|
||||
*valp = ((uint64_t *)db->db_data)[off];
|
||||
@ -292,9 +300,11 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
|
||||
*/
|
||||
blk = (idx*2) >> (bs-3);
|
||||
|
||||
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
|
||||
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
|
||||
err = dmu_buf_hold_by_dnode(dn,
|
||||
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
|
||||
DMU_READ_NO_PREFETCH);
|
||||
dmu_buf_dnode_exit(zap->zap_dbuf);
|
||||
if (err == 0)
|
||||
dmu_buf_rele(db, FTAG);
|
||||
}
|
||||
@ -502,6 +512,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
|
||||
zap_leaf_t *l;
|
||||
int bs = FZAP_BLOCK_SHIFT(zap);
|
||||
int err;
|
||||
dnode_t *dn;
|
||||
|
||||
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
|
||||
|
||||
@ -515,8 +526,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
|
||||
if (blkid == 0)
|
||||
return (ENOENT);
|
||||
|
||||
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
|
||||
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
|
||||
err = dmu_buf_hold_by_dnode(dn,
|
||||
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
|
||||
dmu_buf_dnode_exit(zap->zap_dbuf);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
|
@ -536,6 +536,24 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
|
||||
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
|
||||
{
|
||||
dmu_buf_t *db;
|
||||
int err;
|
||||
|
||||
err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
|
||||
if (err != 0) {
|
||||
return (err);
|
||||
}
|
||||
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
|
||||
if (err != 0) {
|
||||
dmu_buf_rele(db, tag);
|
||||
}
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
|
||||
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
|
||||
@ -927,6 +945,33 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
zap_lookup_by_dnode(dnode_t *dn, const char *name,
|
||||
uint64_t integer_size, uint64_t num_integers, void *buf)
|
||||
{
|
||||
return (zap_lookup_norm_by_dnode(dn, name, integer_size,
|
||||
num_integers, buf, MT_EXACT, NULL, 0, NULL));
|
||||
}
|
||||
|
||||
int
|
||||
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
|
||||
uint64_t integer_size, uint64_t num_integers, void *buf,
|
||||
matchtype_t mt, char *realname, int rn_len,
|
||||
boolean_t *ncp)
|
||||
{
|
||||
zap_t *zap;
|
||||
int err;
|
||||
|
||||
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
|
||||
FTAG, &zap);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
err = zap_lookup_impl(zap, name, integer_size,
|
||||
num_integers, buf, mt, realname, rn_len, ncp);
|
||||
zap_unlockdir(zap, FTAG);
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
||||
int key_numints)
|
||||
@ -1460,7 +1505,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
|
||||
}
|
||||
|
||||
int
|
||||
zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
|
||||
zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
|
||||
uint64_t *towrite, uint64_t *tooverwrite)
|
||||
{
|
||||
zap_t *zap;
|
||||
@ -1488,7 +1533,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
|
||||
* At present we are just evaluating the possibility of this operation
|
||||
* and hence we do not want to trigger an upgrade.
|
||||
*/
|
||||
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE,
|
||||
err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
|
||||
FTAG, &zap);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
@ -1552,7 +1597,7 @@ EXPORT_SYMBOL(zap_lookup_uint64);
|
||||
EXPORT_SYMBOL(zap_contains);
|
||||
EXPORT_SYMBOL(zap_prefetch);
|
||||
EXPORT_SYMBOL(zap_prefetch_uint64);
|
||||
EXPORT_SYMBOL(zap_count_write);
|
||||
EXPORT_SYMBOL(zap_count_write_by_dnode);
|
||||
EXPORT_SYMBOL(zap_add);
|
||||
EXPORT_SYMBOL(zap_add_uint64);
|
||||
EXPORT_SYMBOL(zap_update);
|
||||
|
Loading…
Reference in New Issue
Block a user