mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 02:14:28 +03:00
dbuf_hold_impl() cleanup to improve cached read performance
Currently every dbuf_hold_impl() call incurs a kmem_alloc() and a
kmem_free(), which can be costly for cached read performance.
This change reverts the earlier dbuf_hold_impl() stack-usage fix, i.e. commit
fc5bb51f08,
to eliminate the extra
kmem_alloc() and kmem_free() operations and improve cached read
performance. With the change, each dbuf_hold_impl() frame uses 40 bytes
more, for a total of 800 bytes over 20 recursive levels. Linux kernel stack
sizes are 8K and 16K for 32-bit and 64-bit, respectively, so the risk of
stack overrun is limited.
Sample stack output comparison for a 50 PB file with recordsize=512:
Current code:
11) 2240 64 arc_alloc_buf+0x4a/0xd0 [zfs]
12) 2176 264 dbuf_read_impl.constprop.16+0x2e3/0x7f0 [zfs]
13) 1912 120 dbuf_read+0xe5/0x520 [zfs]
14) 1792 56 dbuf_hold_impl_arg+0x572/0x630 [zfs]
15) 1736 64 dbuf_hold_impl_arg+0x508/0x630 [zfs]
16) 1672 64 dbuf_hold_impl_arg+0x508/0x630 [zfs]
17) 1608 40 dbuf_hold_impl+0x23/0x40 [zfs]
18) 1568 40 dbuf_hold_level+0x32/0x60 [zfs]
19) 1528 16 dbuf_hold+0x16/0x20 [zfs]
With the dbuf_hold_impl() cleanup:
11) 2320 64 arc_alloc_buf+0x4a/0xd0 [zfs]
12) 2256 264 dbuf_read_impl.constprop.17+0x2e3/0x7f0 [zfs]
13) 1992 120 dbuf_read+0xe5/0x520 [zfs]
14) 1872 96 dbuf_hold_impl+0x50f/0x5e0 [zfs]
15) 1776 104 dbuf_hold_impl+0x4df/0x5e0 [zfs]
16) 1672 104 dbuf_hold_impl+0x4df/0x5e0 [zfs]
17) 1568 40 dbuf_hold_level+0x32/0x60 [zfs]
18) 1528 16 dbuf_hold+0x16/0x20 [zfs]
Performance observations on an 8K recordsize filesystem:
- 8/128/1024K at 1-128 sequential cached read, ~3% improvement
Testing was done on Ubuntu 18.04 with a 4.15 kernel, 8 vCPUs, and SSD
storage on VMware ESX.
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Nguyen <tony.nguyen@delphix.com>
Closes #9351
This commit is contained in:
parent
73cdcc6323
commit
64b6c47d90
@ -148,29 +148,6 @@ dbuf_stats_t dbuf_stats = {
|
||||
continue; \
|
||||
}
|
||||
|
||||
typedef struct dbuf_hold_arg {
|
||||
/* Function arguments */
|
||||
dnode_t *dh_dn;
|
||||
uint8_t dh_level;
|
||||
uint64_t dh_blkid;
|
||||
boolean_t dh_fail_sparse;
|
||||
boolean_t dh_fail_uncached;
|
||||
void *dh_tag;
|
||||
dmu_buf_impl_t **dh_dbp;
|
||||
/* Local variables */
|
||||
dmu_buf_impl_t *dh_db;
|
||||
dmu_buf_impl_t *dh_parent;
|
||||
blkptr_t *dh_bp;
|
||||
int dh_err;
|
||||
dbuf_dirty_record_t *dh_dr;
|
||||
} dbuf_hold_arg_t;
|
||||
|
||||
static dbuf_hold_arg_t *dbuf_hold_arg_create(dnode_t *dn, uint8_t level,
|
||||
uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached,
|
||||
void *tag, dmu_buf_impl_t **dbp);
|
||||
static int dbuf_hold_impl_arg(dbuf_hold_arg_t *dh);
|
||||
static void dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh);
|
||||
|
||||
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
|
||||
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
|
||||
|
||||
@ -2805,10 +2782,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
|
||||
} else if (level < nlevels-1) {
|
||||
/* this block is referenced from an indirect block */
|
||||
int err;
|
||||
dbuf_hold_arg_t *dh = dbuf_hold_arg_create(dn, level + 1,
|
||||
|
||||
err = dbuf_hold_impl(dn, level + 1,
|
||||
blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
|
||||
err = dbuf_hold_impl_arg(dh);
|
||||
dbuf_hold_arg_destroy(dh);
|
||||
|
||||
if (err)
|
||||
return (err);
|
||||
err = dbuf_read(*parentp, NULL,
|
||||
@ -3228,24 +3205,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
||||
zio_nowait(pio);
|
||||
}
|
||||
|
||||
#define DBUF_HOLD_IMPL_MAX_DEPTH 20
|
||||
|
||||
/*
|
||||
* Helper function for dbuf_hold_impl_arg() to copy a buffer. Handles
|
||||
* Helper function for dbuf_hold_impl() to copy a buffer. Handles
|
||||
* the case of encrypted, compressed and uncompressed buffers by
|
||||
* allocating the new buffer, respectively, with arc_alloc_raw_buf(),
|
||||
* arc_alloc_compressed_buf() or arc_alloc_buf().*
|
||||
*
|
||||
* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl_arg().
|
||||
* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
|
||||
*/
|
||||
noinline static void
|
||||
dbuf_hold_copy(struct dbuf_hold_arg *dh)
|
||||
dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
|
||||
{
|
||||
dnode_t *dn = dh->dh_dn;
|
||||
dmu_buf_impl_t *db = dh->dh_db;
|
||||
dbuf_dirty_record_t *dr = dh->dh_dr;
|
||||
dbuf_dirty_record_t *dr = db->db_data_pending;
|
||||
arc_buf_t *data = dr->dt.dl.dr_data;
|
||||
|
||||
enum zio_compress compress_type = arc_get_compression(data);
|
||||
|
||||
if (arc_is_encrypted(data)) {
|
||||
@ -3277,170 +3249,113 @@ dbuf_hold_copy(struct dbuf_hold_arg *dh)
|
||||
* Returns with db_holds incremented, and db_mtx not held.
|
||||
* Note: dn_struct_rwlock must be held.
|
||||
*/
|
||||
static int
|
||||
dbuf_hold_impl_arg(struct dbuf_hold_arg *dh)
|
||||
int
|
||||
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
|
||||
boolean_t fail_sparse, boolean_t fail_uncached,
|
||||
void *tag, dmu_buf_impl_t **dbp)
|
||||
{
|
||||
dh->dh_parent = NULL;
|
||||
|
||||
ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
|
||||
ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
|
||||
ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
|
||||
|
||||
*(dh->dh_dbp) = NULL;
|
||||
dmu_buf_impl_t *db, *parent = NULL;
|
||||
|
||||
/* If the pool has been created, verify the tx_sync_lock is not held */
|
||||
spa_t *spa = dh->dh_dn->dn_objset->os_spa;
|
||||
spa_t *spa = dn->dn_objset->os_spa;
|
||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
||||
if (dp != NULL) {
|
||||
ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
|
||||
}
|
||||
|
||||
ASSERT(blkid != DMU_BONUS_BLKID);
|
||||
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
||||
ASSERT3U(dn->dn_nlevels, >, level);
|
||||
|
||||
*dbp = NULL;
|
||||
|
||||
/* dbuf_find() returns with db_mtx held */
|
||||
dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
|
||||
dh->dh_level, dh->dh_blkid);
|
||||
db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
|
||||
|
||||
if (dh->dh_db == NULL) {
|
||||
dh->dh_bp = NULL;
|
||||
if (db == NULL) {
|
||||
blkptr_t *bp = NULL;
|
||||
int err;
|
||||
|
||||
if (dh->dh_fail_uncached)
|
||||
if (fail_uncached)
|
||||
return (SET_ERROR(ENOENT));
|
||||
|
||||
ASSERT3P(dh->dh_parent, ==, NULL);
|
||||
dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
|
||||
dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp);
|
||||
if (dh->dh_fail_sparse) {
|
||||
if (dh->dh_err == 0 &&
|
||||
dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
|
||||
dh->dh_err = SET_ERROR(ENOENT);
|
||||
if (dh->dh_err) {
|
||||
if (dh->dh_parent)
|
||||
dbuf_rele(dh->dh_parent, NULL);
|
||||
return (dh->dh_err);
|
||||
ASSERT3P(parent, ==, NULL);
|
||||
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
|
||||
if (fail_sparse) {
|
||||
if (err == 0 && bp && BP_IS_HOLE(bp))
|
||||
err = SET_ERROR(ENOENT);
|
||||
if (err) {
|
||||
if (parent)
|
||||
dbuf_rele(parent, NULL);
|
||||
return (err);
|
||||
}
|
||||
}
|
||||
if (dh->dh_err && dh->dh_err != ENOENT)
|
||||
return (dh->dh_err);
|
||||
dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
|
||||
dh->dh_parent, dh->dh_bp);
|
||||
if (err && err != ENOENT)
|
||||
return (err);
|
||||
db = dbuf_create(dn, level, blkid, parent, bp);
|
||||
}
|
||||
|
||||
if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
|
||||
mutex_exit(&dh->dh_db->db_mtx);
|
||||
if (fail_uncached && db->db_state != DB_CACHED) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
if (dh->dh_db->db_buf != NULL) {
|
||||
arc_buf_access(dh->dh_db->db_buf);
|
||||
ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
|
||||
if (db->db_buf != NULL) {
|
||||
arc_buf_access(db->db_buf);
|
||||
ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
|
||||
}
|
||||
|
||||
ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
|
||||
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
|
||||
|
||||
/*
|
||||
* If this buffer is currently syncing out, and we are
|
||||
* still referencing it from db_data, we need to make a copy
|
||||
* of it in case we decide we want to dirty it again in this txg.
|
||||
*/
|
||||
if (dh->dh_db->db_level == 0 &&
|
||||
dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
|
||||
dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
|
||||
dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
|
||||
dh->dh_dr = dh->dh_db->db_data_pending;
|
||||
if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf)
|
||||
dbuf_hold_copy(dh);
|
||||
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
||||
dn->dn_object != DMU_META_DNODE_OBJECT &&
|
||||
db->db_state == DB_CACHED && db->db_data_pending) {
|
||||
dbuf_dirty_record_t *dr = db->db_data_pending;
|
||||
if (dr->dt.dl.dr_data == db->db_buf)
|
||||
dbuf_hold_copy(dn, db);
|
||||
}
|
||||
|
||||
if (multilist_link_active(&dh->dh_db->db_cache_link)) {
|
||||
ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds));
|
||||
ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE ||
|
||||
dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE);
|
||||
if (multilist_link_active(&db->db_cache_link)) {
|
||||
ASSERT(zfs_refcount_is_zero(&db->db_holds));
|
||||
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
|
||||
db->db_caching_status == DB_DBUF_METADATA_CACHE);
|
||||
|
||||
multilist_remove(
|
||||
dbuf_caches[dh->dh_db->db_caching_status].cache,
|
||||
dh->dh_db);
|
||||
multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
|
||||
(void) zfs_refcount_remove_many(
|
||||
&dbuf_caches[dh->dh_db->db_caching_status].size,
|
||||
dh->dh_db->db.db_size, dh->dh_db);
|
||||
&dbuf_caches[db->db_caching_status].size,
|
||||
db->db.db_size, db);
|
||||
|
||||
if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) {
|
||||
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
|
||||
DBUF_STAT_BUMPDOWN(metadata_cache_count);
|
||||
} else {
|
||||
DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
|
||||
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
|
||||
DBUF_STAT_BUMPDOWN(cache_count);
|
||||
DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
|
||||
dh->dh_db->db.db_size);
|
||||
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
|
||||
db->db.db_size);
|
||||
}
|
||||
dh->dh_db->db_caching_status = DB_NO_CACHE;
|
||||
db->db_caching_status = DB_NO_CACHE;
|
||||
}
|
||||
(void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
|
||||
DBUF_VERIFY(dh->dh_db);
|
||||
mutex_exit(&dh->dh_db->db_mtx);
|
||||
(void) zfs_refcount_add(&db->db_holds, tag);
|
||||
DBUF_VERIFY(db);
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
/* NOTE: we can't rele the parent until after we drop the db_mtx */
|
||||
if (dh->dh_parent)
|
||||
dbuf_rele(dh->dh_parent, NULL);
|
||||
if (parent)
|
||||
dbuf_rele(parent, NULL);
|
||||
|
||||
ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
|
||||
ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
|
||||
ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
|
||||
*(dh->dh_dbp) = dh->dh_db;
|
||||
ASSERT3P(DB_DNODE(db), ==, dn);
|
||||
ASSERT3U(db->db_blkid, ==, blkid);
|
||||
ASSERT3U(db->db_level, ==, level);
|
||||
*dbp = db;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* dbuf_hold_impl_arg() is called recursively, via dbuf_findbp(). There can
|
||||
* be as many recursive calls as there are levels of on-disk indirect blocks,
|
||||
* but typically only 0-2 recursive calls. To minimize the stack frame size,
|
||||
* the recursive function's arguments and "local variables" are allocated on
|
||||
* the heap as the dbuf_hold_arg_t.
|
||||
*/
|
||||
int
|
||||
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
|
||||
boolean_t fail_sparse, boolean_t fail_uncached,
|
||||
void *tag, dmu_buf_impl_t **dbp)
|
||||
{
|
||||
dbuf_hold_arg_t *dh = dbuf_hold_arg_create(dn, level, blkid,
|
||||
fail_sparse, fail_uncached, tag, dbp);
|
||||
|
||||
int error = dbuf_hold_impl_arg(dh);
|
||||
|
||||
dbuf_hold_arg_destroy(dh);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
static dbuf_hold_arg_t *
|
||||
dbuf_hold_arg_create(dnode_t *dn, uint8_t level, uint64_t blkid,
|
||||
boolean_t fail_sparse, boolean_t fail_uncached,
|
||||
void *tag, dmu_buf_impl_t **dbp)
|
||||
{
|
||||
dbuf_hold_arg_t *dh = kmem_alloc(sizeof (*dh), KM_SLEEP);
|
||||
dh->dh_dn = dn;
|
||||
dh->dh_level = level;
|
||||
dh->dh_blkid = blkid;
|
||||
|
||||
dh->dh_fail_sparse = fail_sparse;
|
||||
dh->dh_fail_uncached = fail_uncached;
|
||||
|
||||
dh->dh_tag = tag;
|
||||
dh->dh_dbp = dbp;
|
||||
|
||||
dh->dh_db = NULL;
|
||||
dh->dh_parent = NULL;
|
||||
dh->dh_bp = NULL;
|
||||
dh->dh_err = 0;
|
||||
dh->dh_dr = NULL;
|
||||
|
||||
return (dh);
|
||||
}
|
||||
|
||||
static void
|
||||
dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh)
|
||||
{
|
||||
kmem_free(dh, sizeof (*dh));
|
||||
}
|
||||
|
||||
dmu_buf_impl_t *
|
||||
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user