Decrease contention on dn_struct_rwlock

Currently, sequential async write workloads spend a lot of time 
contending on the dn_struct_rwlock. This lock is responsible for 
protecting the entire block tree below it; this naturally results 
in some serialization during heavy write workloads. This can be 
resolved by having per-dbuf locking, which will allow multiple 
writers in the same object at the same time.

We introduce a new rwlock, the db_rwlock. This lock is responsible 
for protecting the contents of the dbuf that it is a part of; when 
reading a block pointer from a dbuf, you hold the lock as a reader. 
When writing data to a dbuf, you hold it as a writer. This allows 
multiple threads to write to different parts of a file at the same 
time.
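
To make the protocol concrete, here is a minimal sketch (not code from this
change; the function names are hypothetical) of the two rules above, using
the db_rwlock fields this commit introduces:

	/* reading a child's block pointer: parent's lock, as reader */
	static boolean_t
	my_child_is_hole(dmu_buf_impl_t *db)
	{
		boolean_t hole;

		/* assumes db has a parent dbuf; dmu_buf_lock_parent()
		 * below handles the no-parent cases */
		rw_enter(&db->db_parent->db_rwlock, RW_READER);
		hole = BP_IS_HOLE(db->db_blkptr);
		rw_exit(&db->db_parent->db_rwlock);
		return (hole);
	}

	/* writing a dbuf's contents: its own lock, as writer */
	static void
	my_fill_dbuf(dmu_buf_impl_t *db, const void *src, size_t len)
	{
		rw_enter(&db->db_rwlock, RW_WRITER);
		bcopy(src, db->db.db_data, len);
		rw_exit(&db->db_rwlock);
	}

Because each writer holds only its own dbuf's lock, writers to different
blocks of the same object no longer serialize on dn_struct_rwlock.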

Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
External-issue: DLPX-52564
External-issue: DLPX-53085
External-issue: DLPX-57384
Closes #8946
Authored by Paul Dagnelie on 2019-07-08 13:18:50 -07:00; committed by Brian Behlendorf
commit f664f1ee7f
parent cb70964221
7 changed files with 247 additions and 120 deletions

include/sys/dbuf.h

@@ -108,6 +108,12 @@ typedef enum override_states {
 	DR_OVERRIDDEN
 } override_states_t;
 
+typedef enum db_lock_type {
+	DLT_NONE,
+	DLT_PARENT,
+	DLT_OBJSET
+} db_lock_type_t;
+
 typedef struct dbuf_dirty_record {
 	/* link on our parents dirty list */
 	list_node_t dr_dirty_node;
@@ -217,6 +223,22 @@ typedef struct dmu_buf_impl {
 	 */
 	uint8_t db_level;
 
+	/*
+	 * Protects db_buf's contents if they contain an indirect block or data
+	 * block of the meta-dnode. We use this lock to protect the structure of
+	 * the block tree. This means that when modifying this dbuf's data, we
+	 * grab its rwlock. When modifying its parent's data (including the
+	 * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering
+	 * for this lock is:
+	 * 1) dn_struct_rwlock
+	 * 2) db_rwlock
+	 * We don't currently grab multiple dbufs' db_rwlocks at once.
+	 */
+	krwlock_t db_rwlock;
+
+	/* buffer holding our data */
+	arc_buf_t *db_buf;
+
 	/* db_mtx protects the members below */
 	kmutex_t db_mtx;
@@ -232,9 +254,6 @@ typedef struct dmu_buf_impl {
 	 */
 	zfs_refcount_t db_holds;
 
-	/* buffer holding our data */
-	arc_buf_t *db_buf;
-
 	kcondvar_t db_changed;
 	dbuf_dirty_record_t *db_data_pending;
@@ -335,6 +354,8 @@ void dbuf_destroy(dmu_buf_impl_t *db);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
+db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag);
+void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);

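For reference, a minimal sketch (hypothetical function name; the pattern
mirrors the callers added in dbuf.c and dnode.c below) of how the two new
helpers declared above are meant to be paired:

	static boolean_t
	my_bp_is_hole(dmu_buf_impl_t *db)
	{
		boolean_t hole;

		/*
		 * Takes whichever lock protects db_blkptr: the parent
		 * dbuf's db_rwlock (DLT_PARENT), the objset's ds_bp_rwlock
		 * (DLT_OBJSET), or nothing for the MOS meta-dnode's
		 * top-most indirect block (DLT_NONE).
		 */
		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
		hole = (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr));
		/*
		 * Hand back the type we were given, not a fresh lookup: an
		 * indirection increase may have given db a parent since.
		 */
		dmu_buf_unlock_parent(db, dblt, FTAG);
		return (hole);
	}
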
include/sys/dmu_zfetch.h

@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef	_DMU_ZFETCH_H
@@ -66,7 +66,8 @@ void		zfetch_fini(void);
 
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
 void		dmu_zfetch_fini(zfetch_t *);
-void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t,
+    boolean_t);
 
 #ifdef	__cplusplus

module/zfs/dbuf.c

@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -287,6 +287,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
 	bzero(db, sizeof (dmu_buf_impl_t));
 
 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&db->db_cache_link);
 	zfs_refcount_create(&db->db_holds);
@@ -300,6 +301,7 @@ dbuf_dest(void *vdb, void *unused)
 {
 	dmu_buf_impl_t *db = vdb;
 	mutex_destroy(&db->db_mtx);
+	rw_destroy(&db->db_rwlock);
 	cv_destroy(&db->db_changed);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 	zfs_refcount_destroy(&db->db_holds);
@@ -1014,10 +1016,10 @@ dbuf_verify(dmu_buf_impl_t *db)
 			    db->db.db_object);
 			/*
 			 * dnode_grow_indblksz() can make this fail if we don't
-			 * have the struct_rwlock.  XXX indblksz no longer
+			 * have the parent's rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
-			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+			if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
@@ -1177,6 +1179,44 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
 	}
 }
 
+/*
+ * This function is used to lock the parent of the provided dbuf. This should be
+ * used when modifying or reading db_blkptr.
+ */
+db_lock_type_t
+dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
+{
+	enum db_lock_type ret = DLT_NONE;
+	if (db->db_parent != NULL) {
+		rw_enter(&db->db_parent->db_rwlock, rw);
+		ret = DLT_PARENT;
+	} else if (dmu_objset_ds(db->db_objset) != NULL) {
+		rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
+		    tag);
+		ret = DLT_OBJSET;
+	}
+	/*
+	 * We only return a DLT_NONE lock when it's the top-most indirect block
+	 * of the meta-dnode of the MOS.
+	 */
+	return (ret);
+}
+
+/*
+ * We need to pass the lock type in because it's possible that the block will
+ * move from being the topmost indirect block in a dnode (and thus, have no
+ * parent) to not the top-most via an indirection increase. This would cause a
+ * panic if we didn't pass the lock type in.
+ */
+void
+dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
+{
+	if (type == DLT_PARENT)
+		rw_exit(&db->db_parent->db_rwlock);
+	else if (type == DLT_OBJSET)
+		rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
+}
+
 static void
 dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *vdb)
@@ -1273,8 +1313,13 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 	return (err);
 }
 
+/*
+ * Drops db_mtx and the parent lock specified by dblt and tag before
+ * returning.
+ */
 static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+    db_lock_type_t dblt, void *tag)
 {
 	dnode_t *dn;
 	zbookmark_phys_t zb;
@@ -1284,11 +1329,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
-	/* We need the struct_rwlock to prevent db_blkptr from changing. */
-	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED);
 	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db_parent == NULL ||
+	    RW_LOCK_HELD(&db->db_parent->db_rwlock));
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/*
@@ -1316,6 +1361,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
+		dmu_buf_unlock_parent(db, dblt, tag);
 		return (0);
 	}
 
@@ -1356,6 +1402,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
+		dmu_buf_unlock_parent(db, dblt, tag);
 		return (0);
 	}
 
@@ -1387,12 +1434,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 		    "object set %llu", dmu_objset_id(db->db_objset));
 		DB_DNODE_EXIT(db);
 		mutex_exit(&db->db_mtx);
+		dmu_buf_unlock_parent(db, dblt, tag);
 		return (SET_ERROR(EIO));
 	}
 
 	err = dbuf_read_verify_dnode_crypt(db, flags);
 	if (err != 0) {
 		DB_DNODE_EXIT(db);
+		dmu_buf_unlock_parent(db, dblt, tag);
 		mutex_exit(&db->db_mtx);
 		return (err);
 	}
@@ -1412,11 +1461,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
 	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
 		zio_flags |= ZIO_FLAG_RAW;
 
-	err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
+	/*
+	 * The zio layer will copy the provided blkptr later, but we need to
+	 * do this now so that we can release the parent's rwlock. We have to
+	 * do that now so that if dbuf_read_done is called synchronously (on
+	 * an l1 cache hit) we don't acquire the db_mtx while holding the
+	 * parent's rwlock, which would be a lock ordering violation.
+	 */
+	blkptr_t bp = *db->db_blkptr;
+	dmu_buf_unlock_parent(db, dblt, tag);
+	(void) arc_read(zio, db->db_objset->os_spa, &bp,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
 	    &aflags, &zb);
-
 	return (err);
 }
@@ -1514,8 +1570,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
-	if ((flags & DB_RF_HAVESTRUCT) == 0)
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
@@ -1552,30 +1606,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
-		if (err == 0 && prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
-		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&dn->dn_struct_rwlock);
+		if (err == 0 && prefetch) {
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+			    flags & DB_RF_HAVESTRUCT);
+		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_hits);
 	} else if (db->db_state == DB_UNCACHED) {
 		spa_t *spa = dn->dn_objset->os_spa;
 		boolean_t need_wait = B_FALSE;
 
+		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+
 		if (zio == NULL &&
 		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			need_wait = B_TRUE;
 		}
-		err = dbuf_read_impl(db, zio, flags);
-
-		/* dbuf_read_impl has dropped db_mtx for us */
-
-		if (!err && prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
-
-		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&dn->dn_struct_rwlock);
+		err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+
+		/*
+		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
+		 * for us
+		 */
+		if (!err && prefetch) {
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+			    flags & DB_RF_HAVESTRUCT);
+		}
 
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_misses);
@@ -1600,10 +1657,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		 * occurred and the dbuf went to UNCACHED.
 		 */
 		mutex_exit(&db->db_mtx);
-		if (prefetch)
-			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
-		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&dn->dn_struct_rwlock);
+		if (prefetch) {
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+			    flags & DB_RF_HAVESTRUCT);
+		}
 		DB_DNODE_EXIT(db);
 		DBUF_STAT_BUMP(hash_misses);
@@ -1785,7 +1842,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
+			rw_enter(&db->db_rwlock, RW_WRITER);
 			bzero(db->db.db_data, db->db.db_size);
+			rw_exit(&db->db_rwlock);
 			arc_buf_freeze(db->db_buf);
 		}
@@ -1809,15 +1868,6 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
-	/* XXX does *this* func really need the lock? */
-	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
-	/*
-	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
-	 * is OK, because there can be no other references to the db
-	 * when we are changing its size, so no concurrent DB_FILL can
-	 * be happening.
-	 */
 	/*
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
@@ -1894,8 +1944,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	dnode_t *dn;
 	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
-	int drop_struct_lock = FALSE;
 	int txgoff = tx->tx_txg & TXG_MASK;
+	boolean_t drop_struct_rwlock = B_FALSE;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
@@ -2098,15 +2148,21 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		return (dr);
 	}
 
-	/*
-	 * The dn_struct_rwlock prevents db_blkptr from changing
-	 * due to a write from syncing context completing
-	 * while we are running, so we want to acquire it before
-	 * looking at db_blkptr.
-	 */
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		drop_struct_lock = TRUE;
+		drop_struct_rwlock = B_TRUE;
+	}
+
+	/*
+	 * If we are overwriting a dedup BP, then unless it is snapshotted,
+	 * when we get to syncing context we will need to decrement its
+	 * refcount in the DDT.  Prefetch the relevant DDT block so that
+	 * syncing context won't have to wait for the i/o.
+	 */
+	if (db->db_blkptr != NULL) {
+		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+		ddt_prefetch(os->os_spa, db->db_blkptr);
+		dmu_buf_unlock_parent(db, dblt, FTAG);
 	}
 
 	/*
@@ -2119,19 +2175,12 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
 
-	/*
-	 * If we are overwriting a dedup BP, then unless it is snapshotted,
-	 * when we get to syncing context we will need to decrement its
-	 * refcount in the DDT.  Prefetch the relevant DDT block so that
-	 * syncing context won't have to wait for the i/o.
-	 */
-	ddt_prefetch(os->os_spa, db->db_blkptr);
-
 	if (db->db_level == 0) {
 		ASSERT(!db->db_objset->os_raw_receive ||
 		    dn->dn_maxblkid >= db->db_blkid);
 		dnode_new_blkid(dn, db->db_blkid, tx,
-		    drop_struct_lock, B_FALSE);
+		    drop_struct_rwlock, B_FALSE);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
 	}
@@ -2142,15 +2191,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
-			parent = dbuf_hold_level(dn, db->db_level+1,
+			parent = dbuf_hold_level(dn, db->db_level + 1,
 			    db->db_blkid >> epbs, FTAG);
 			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
-		if (drop_struct_lock)
+		if (drop_struct_rwlock)
 			rw_exit(&dn->dn_struct_rwlock);
-		ASSERT3U(db->db_level+1, ==, parent->db_level);
+		ASSERT3U(db->db_level + 1, ==, parent->db_level);
 		di = dbuf_dirty(parent, tx);
 		if (parent_held)
 			dbuf_rele(parent, FTAG);
@@ -2171,14 +2219,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
-		ASSERT(db->db_level+1 == dn->dn_nlevels);
+		ASSERT(db->db_level + 1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
-		if (drop_struct_lock)
+		if (drop_struct_rwlock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
@@ -2767,10 +2815,12 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 			*parentp = NULL;
 			return (err);
 		}
+		rw_enter(&(*parentp)->db_rwlock, RW_READER);
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
 			ASSERT(BP_IS_HOLE(*bpp));
+		rw_exit(&(*parentp)->db_rwlock);
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
@@ -3060,7 +3110,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
 	if (blkid > dn->dn_maxblkid)
 		return;
 
-	if (dnode_block_freed(dn, blkid))
+	if (level == 0 && dnode_block_freed(dn, blkid))
 		return;
 
 	/*
@@ -3215,7 +3265,9 @@ dbuf_hold_copy(struct dbuf_hold_arg *dh)
 		    DBUF_GET_BUFC_TYPE(db), db->db.db_size));
 	}
 
+	rw_enter(&db->db_rwlock, RW_WRITER);
 	bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+	rw_exit(&db->db_rwlock);
 }
 
 /*
@@ -3406,7 +3458,6 @@ int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dnode_t *dn;
 
 	if (db->db_blkid != DMU_SPILL_BLKID)
 		return (SET_ERROR(ENOTSUP));
@@ -3415,12 +3466,7 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
 	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dbuf_new_size(db, blksz, tx);
-	rw_exit(&dn->dn_struct_rwlock);
-	DB_DNODE_EXIT(db);
 
 	return (0);
 }
@@ -4183,9 +4229,9 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 		mutex_exit(&db->db_mtx);
 
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
 		*db->db_blkptr = *bp;
-		rw_exit(&dn->dn_struct_rwlock);
+		dmu_buf_unlock_parent(db, dblt, FTAG);
 	}
 
 /* ARGSUSED */
@@ -4226,9 +4272,9 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 		 * anybody from reading the blocks we're about to
 		 * zero out.
 		 */
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		rw_enter(&db->db_rwlock, RW_WRITER);
 		bzero(db->db.db_data, db->db.db_size);
-		rw_exit(&dn->dn_struct_rwlock);
+		rw_exit(&db->db_rwlock);
 	}
 	DB_DNODE_EXIT(db);
 }
@@ -4419,7 +4465,7 @@ dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
 }
 
 static void
-dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 {
 	blkptr_t bp_copy = *bp;
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
@@ -4433,14 +4479,16 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
 		/*
-		 * The struct_rwlock prevents dbuf_read_impl() from
+		 * The db_rwlock prevents dbuf_read_impl() from
 		 * dereferencing the BP while we are changing it.  To
 		 * avoid lock contention, only grab it when we are actually
 		 * changing the BP.
 		 */
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		if (rw != NULL)
+			rw_enter(rw, RW_WRITER);
 		*bp = bp_copy;
-		rw_exit(&dn->dn_struct_rwlock);
+		if (rw != NULL)
+			rw_exit(rw);
 	}
 }
@@ -4459,7 +4507,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 	if (db->db_level > 0) {
 		blkptr_t *bp = db->db.db_data;
 		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
-			dbuf_remap_impl(dn, &bp[i], tx);
+			dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
 		}
 	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dnode_phys_t *dnp = db->db.db_data;
@@ -4468,7 +4516,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
 		    i += dnp[i].dn_extra_slots + 1) {
 			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
-				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
+				krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
+				    &dn->dn_dbuf->db_rwlock);
+				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
+				    tx);
 			}
 		}
 	}

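One detail in the dbuf_read_impl() hunk above is worth calling out: the block
pointer is copied to the stack so the parent's rwlock can be dropped before
the read is issued. A distilled sketch (hypothetical function name; the calls
are the ones used in the hunk):

	static void
	my_issue_read(dmu_buf_impl_t *db, zio_t *zio, db_lock_type_t dblt,
	    void *tag, int zio_flags, arc_flags_t *aflags, zbookmark_phys_t *zb)
	{
		/*
		 * Snapshot the bp while the parent lock is still held. If
		 * arc_read() completes synchronously (an L1 cache hit), its
		 * done callback takes db_mtx; taking db_mtx while holding
		 * the parent's db_rwlock would violate the lock order.
		 */
		blkptr_t bp = *db->db_blkptr;
		dmu_buf_unlock_parent(db, dblt, tag);
		(void) arc_read(zio, db->db_objset->os_spa, &bp,
		    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
		    aflags, zb);
	}
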
module/zfs/dmu.c

@@ -158,8 +158,8 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 
-	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
@@ -183,8 +183,8 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
-	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
@@ -549,7 +549,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
 	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
 		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
-		    read && DNODE_IS_CACHEABLE(dn));
+		    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
@@ -681,7 +681,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	if (err != 0)
 		return;
 
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	/*
 	 * offset + len - 1 is the last byte we want to prefetch for, and offset
 	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
@@ -689,6 +688,7 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	 * offset) is the first.  Then the number we need to prefetch is the
 	 * last - first + 1.
 	 */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (level > 0 || dn->dn_datablkshift != 0) {
 		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
 		    dbuf_whichblock(dn, level, offset) + 1;
@@ -701,7 +701,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 		for (int i = 0; i < nblks; i++)
 			dbuf_prefetch(dn, level, blkid + i, pri, 0);
 	}
-
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);

module/zfs/dmu_zfetch.c

@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -205,7 +205,8 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
  *	TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
 void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+    boolean_t have_lock)
 {
 	zstream_t *zs;
 	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
@@ -236,6 +237,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 		return;
 
 retry:
+	if (!have_lock)
+		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
 	rw_enter(&zf->zf_rwlock, rw);
 
 	/*
@@ -260,6 +263,10 @@ retry:
 				/* Already prefetched this before. */
 				mutex_exit(&zs->zs_lock);
 				rw_exit(&zf->zf_rwlock);
+				if (!have_lock) {
+					rw_exit(&zf->zf_dnode->
+					    dn_struct_rwlock);
+				}
 				return;
 			}
 			break;
@@ -276,12 +283,16 @@ retry:
 		ZFETCHSTAT_BUMP(zfetchstat_misses);
 		if (rw == RW_READER && !rw_tryupgrade(&zf->zf_rwlock)) {
 			rw_exit(&zf->zf_rwlock);
+			if (!have_lock)
+				rw_exit(&zf->zf_dnode->dn_struct_rwlock);
 			rw = RW_WRITER;
 			goto retry;
 		}
 		dmu_zfetch_stream_create(zf, end_of_access_blkid);
 		rw_exit(&zf->zf_rwlock);
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
 		return;
 	}
@@ -361,6 +372,8 @@ retry:
 		dbuf_prefetch(zf->zf_dnode, 1, iblk,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
 	}
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
 	ZFETCHSTAT_BUMP(zfetchstat_hits);
 }

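The new have_lock argument tells dmu_zfetch() whether the caller already
holds dn_struct_rwlock, so the function takes it (and drops it on every exit
path) only when needed. A short sketch of the two call styles used by the
callers updated above:

	/* caller does not hold dn_struct_rwlock: dmu_zfetch() takes it */
	dmu_zfetch(&dn->dn_zfetch, blkid, 1, B_TRUE, B_FALSE);

	/* caller already holds it (e.g. DB_RF_HAVESTRUCT): skip the enter */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	dmu_zfetch(&dn->dn_zfetch, blkid, 1, B_TRUE, B_TRUE);
	rw_exit(&dn->dn_struct_rwlock);
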
module/zfs/dnode.c

@@ -1331,7 +1331,6 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 	}
 
 	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
-
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
 		rw_exit(&mdn->dn_struct_rwlock);
@@ -1742,10 +1741,11 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	/* resize the old block */
 	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
-	if (err == 0)
+	if (err == 0) {
 		dbuf_new_size(db, size, tx);
-	else if (err != ENOENT)
+	} else if (err != ENOENT) {
 		goto fail;
+	}
 
 	dnode_setdblksz(dn, size);
 	dnode_setdirty(dn, tx);
@@ -1983,7 +1983,6 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	int trunc = FALSE;
 	int epbs;
 
-	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	blksz = dn->dn_datablksz;
 	blkshift = dn->dn_datablkshift;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -2000,7 +1999,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 		head = P2NPHASE(off, blksz);
 		blkoff = P2PHASE(off, blksz);
 		if ((off >> blkshift) > dn->dn_maxblkid)
-			goto out;
+			return;
 	} else {
 		ASSERT(dn->dn_maxblkid == 0);
 		if (off == 0 && len >= blksz) {
@@ -2009,12 +2008,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 			 */
 			blkid = 0;
 			nblks = 1;
-			if (dn->dn_nlevels > 1)
+			if (dn->dn_nlevels > 1) {
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				dnode_dirty_l1(dn, 0, tx);
+				rw_exit(&dn->dn_struct_rwlock);
+			}
 			goto done;
 		} else if (off >= blksz) {
 			/* Freeing past end-of-data */
-			goto out;
+			return;
 		} else {
 			/* Freeing part of the block. */
 			head = blksz - off;
@@ -2024,19 +2026,26 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	}
 	/* zero out any partial block data at the start of the range */
 	if (head) {
+		int res;
 		ASSERT3U(blkoff + head, ==, blksz);
 		if (len < head)
 			head = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
-		    TRUE, FALSE, FTAG, &db) == 0) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+		    TRUE, FALSE, FTAG, &db);
+		rw_exit(&dn->dn_struct_rwlock);
+		if (res == 0) {
 			caddr_t data;
+			boolean_t dirty;
 
+			db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
+			    FTAG);
 			/* don't dirty if it isn't on disk and isn't dirty */
-			if (db->db_last_dirty ||
-			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
-				rw_exit(&dn->dn_struct_rwlock);
+			dirty = db->db_last_dirty ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+			dmu_buf_unlock_parent(db, dblt, FTAG);
+			if (dirty) {
 				dmu_buf_will_dirty(&db->db, tx);
-				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				data = db->db.db_data;
 				bzero(data + blkoff, head);
 			}
@@ -2048,11 +2057,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 
 	/* If the range was less than one block, we're done */
 	if (len == 0)
-		goto out;
+		return;
 
 	/* If the remaining range is past end of file, we're done */
 	if ((off >> blkshift) > dn->dn_maxblkid)
-		goto out;
+		return;
 
 	ASSERT(ISP2(blksz));
 	if (trunc)
@@ -2063,16 +2072,23 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	ASSERT0(P2PHASE(off, blksz));
 	/* zero out any partial block data at the end of the range */
 	if (tail) {
+		int res;
 		if (len < tail)
 			tail = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
-		    TRUE, FALSE, FTAG, &db) == 0) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+		    TRUE, FALSE, FTAG, &db);
+		rw_exit(&dn->dn_struct_rwlock);
+		if (res == 0) {
+			boolean_t dirty;
 			/* don't dirty if not on disk and not dirty */
-			if (db->db_last_dirty ||
-			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
-				rw_exit(&dn->dn_struct_rwlock);
+			db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
+			    FTAG);
+			dirty = db->db_last_dirty ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+			dmu_buf_unlock_parent(db, type, FTAG);
+			if (dirty) {
 				dmu_buf_will_dirty(&db->db, tx);
-				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				bzero(db->db.db_data, tail);
 			}
 			dbuf_rele(db, FTAG);
@@ -2082,7 +2098,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 
 	/* If the range did not include a full block, we are done */
 	if (len == 0)
-		goto out;
+		return;
 
 	ASSERT(IS_P2ALIGNED(off, blksz));
 	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
@@ -2112,6 +2128,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	 * amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		uint64_t first, last;
 
 		first = blkid >> epbs;
@@ -2156,6 +2173,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 			dnode_dirty_l1(dn, i, tx);
 		}
+		rw_exit(&dn->dn_struct_rwlock);
 	}
 
 done:
@@ -2178,9 +2196,6 @@ done:
 		dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
 
 	dnode_setdirty(dn, tx);
-out:
-	rw_exit(&dn->dn_struct_rwlock);
 }
 
 static boolean_t
@@ -2289,6 +2304,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 	boolean_t hole;
 	int i, inc, error, span;
 
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
 	hole = ((flags & DNODE_FIND_HOLE) != 0);
 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
 	ASSERT(txg == 0 || !hole);
@@ -2321,9 +2338,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 			return (error);
 		}
 		data = db->db.db_data;
+		rw_enter(&db->db_rwlock, RW_READER);
 	}
 
-
 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
 	    db->db_blkptr->blk_birth <= txg ||
 	    BP_IS_HOLE(db->db_blkptr))) {
@@ -2396,8 +2413,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 		error = SET_ERROR(ESRCH);
 	}
 
-	if (db)
+	if (db != NULL) {
+		rw_exit(&db->db_rwlock);
 		dbuf_rele(db, FTAG);
+	}
 
 	return (error);
 }

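The dnode_free_range() changes above share one shape: dn_struct_rwlock is
held only around the calls that still need it (dbuf_hold_impl() and
dnode_dirty_l1()), while the db_blkptr check moves under the parent's lock.
A distilled sketch of the head/tail-zeroing pattern (hypothetical function
name):

	static void
	my_zero_partial_block(dnode_t *dn, uint64_t off, uint64_t blkoff,
	    uint64_t head, dmu_tx_t *tx)
	{
		dmu_buf_impl_t *db;
		int res;

		/* hold dn_struct_rwlock only for the dbuf lookup */
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
		    TRUE, FALSE, FTAG, &db);
		rw_exit(&dn->dn_struct_rwlock);
		if (res != 0)
			return;

		/* db_blkptr is now protected by the parent's lock */
		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
		boolean_t dirty = db->db_last_dirty ||
		    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
		dmu_buf_unlock_parent(db, dblt, FTAG);

		if (dirty) {
			dmu_buf_will_dirty(&db->db, tx);
			bzero((caddr_t)db->db.db_data + blkoff, head);
		}
		dbuf_rele(db, FTAG);
	}
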
module/zfs/dnode_sync.c

@@ -51,7 +51,6 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 	/* this dnode can't be paged out because it's dirty */
 	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
-	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 	ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
 
 	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
@@ -61,8 +60,24 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
 	    dn->dn_object, dn->dn_phys->dn_nlevels);
 
+	/*
+	 * Lock ordering requires that we hold the children's db_mutexes (by
+	 * calling dbuf_find()) before holding the parent's db_rwlock. The lock
+	 * order is imposed by dbuf_read's steps of "grab the lock to protect
+	 * db_parent, get db_parent, hold db_parent's db_rwlock".
+	 */
+	dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
+	ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
+	for (i = 0; i < nblkptr; i++) {
+		children[i] =
+		    dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+	}
+
 	/* transfer dnode's block pointers to new indirect block */
 	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+	if (dn->dn_dbuf != NULL)
+		rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
+	rw_enter(&db->db_rwlock, RW_WRITER);
 	ASSERT(db->db.db_data);
 	ASSERT(arc_released(db->db_buf));
 	ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
@@ -72,8 +87,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 
 	/* set dbuf's parent pointers to new indirect buf */
 	for (i = 0; i < nblkptr; i++) {
-		dmu_buf_impl_t *child =
-		    dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+		dmu_buf_impl_t *child = children[i];
 
 		if (child == NULL)
 			continue;
@@ -106,6 +120,10 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 
 	bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
 
+	rw_exit(&db->db_rwlock);
+	if (dn->dn_dbuf != NULL)
+		rw_exit(&dn->dn_dbuf->db_rwlock);
+
 	dbuf_rele(db, FTAG);
 
 	rw_exit(&dn->dn_struct_rwlock);
@@ -182,7 +200,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 		ASSERT(db->db_level == 1);
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		err = dbuf_hold_impl(dn, db->db_level-1,
+		err = dbuf_hold_impl(dn, db->db_level - 1,
 		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
 		rw_exit(&dn->dn_struct_rwlock);
 		if (err == ENOENT)
@@ -280,7 +298,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	 * ancestor of the first or last block to be freed.  The first and
 	 * last L1 indirect blocks are always dirtied by dnode_free_range().
 	 */
+	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 	VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+	dmu_buf_unlock_parent(db, dblt, FTAG);
 
 	dbuf_release_bp(db);
 	bp = db->db.db_data;
@@ -306,7 +326,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	if (db->db_level == 1) {
 		FREE_VERIFY(db, start, end, tx);
-		free_blocks(dn, bp, end-start+1, tx);
+		rw_enter(&db->db_rwlock, RW_WRITER);
+		free_blocks(dn, bp, end - start + 1, tx);
+		rw_exit(&db->db_rwlock);
 	} else {
 		for (uint64_t id = start; id <= end; id++, bp++) {
 			if (BP_IS_HOLE(bp))
@@ -323,10 +345,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	}
 
 	if (free_indirects) {
+		rw_enter(&db->db_rwlock, RW_WRITER);
 		for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
 			ASSERT(BP_IS_HOLE(bp));
 		bzero(db->db.db_data, db->db.db_size);
 		free_blocks(dn, db->db_blkptr, 1, tx);
+		rw_exit(&db->db_rwlock);
 	}
 
 	DB_DNODE_EXIT(db);
@@ -378,7 +402,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
 			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
 			    TRUE, FALSE, FTAG, &db));
 			rw_exit(&dn->dn_struct_rwlock);
-
 			free_children(db, blkid, nblks, free_indirects, tx);
 			dbuf_rele(db, FTAG);
 		}
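
The dnode_increase_indirection() hunks above encode the one subtle ordering
rule the new scheme adds: a child's db_mtx (acquired inside dbuf_find(),
which returns the dbuf with its mutex held) must be taken before the
parent's db_rwlock. A distilled sketch of that ordering (hypothetical
function name; mirrors the code above):

	static void
	my_reparent_children(dnode_t *dn, dmu_buf_impl_t *new_parent,
	    int old_toplvl, int nblkptr)
	{
		dmu_buf_impl_t *children[DN_MAX_NBLKPTR];

		/*
		 * 1) Look up the children first; dbuf_find() returns each
		 *    child with its db_mtx held.
		 */
		for (int i = 0; i < nblkptr; i++) {
			children[i] = dbuf_find(dn->dn_objset, dn->dn_object,
			    old_toplvl, i);
		}

		/* 2) Only then take the parent's db_rwlock as writer. */
		rw_enter(&new_parent->db_rwlock, RW_WRITER);
		for (int i = 0; i < nblkptr; i++) {
			dmu_buf_impl_t *child = children[i];
			if (child == NULL)
				continue;
			child->db_parent = new_parent;
			mutex_exit(&child->db_mtx);
		}
		rw_exit(&new_parent->db_rwlock);
	}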