mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 19:28:53 +03:00
Decrease contention on dn_struct_rwlock
Currently, sequential async write workloads spend a lot of time contending on the dn_struct_rwlock. This lock is responsible for protecting the entire block tree below it; this naturally results in some serialization during heavy write workloads. This can be resolved by having per-dbuf locking, which will allow multiple writers in the same object at the same time. We introduce a new rwlock, the db_rwlock. This lock is responsible for protecting the contents of the dbuf that it is a part of; when reading a block pointer from a dbuf, you hold the lock as a reader. When writing data to a dbuf, you hold it as a writer. This allows multiple threads to write to different parts of a file at the same time. Reviewed by: Brad Lewis <brad.lewis@delphix.com> Reviewed by: Matt Ahrens matt@delphix.com Reviewed by: George Wilson george.wilson@delphix.com Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Paul Dagnelie <pcd@delphix.com> External-issue: DLPX-52564 External-issue: DLPX-53085 External-issue: DLPX-57384 Closes #8946
This commit is contained in:
committed by
Brian Behlendorf
parent
cb70964221
commit
f664f1ee7f
+29
-6
@@ -51,7 +51,6 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
|
||||
|
||||
/* this dnode can't be paged out because it's dirty */
|
||||
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
|
||||
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
|
||||
ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
|
||||
|
||||
db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
|
||||
@@ -61,8 +60,24 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
|
||||
dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
|
||||
dn->dn_object, dn->dn_phys->dn_nlevels);
|
||||
|
||||
/*
|
||||
* Lock ordering requires that we hold the children's db_mutexes (by
|
||||
* calling dbuf_find()) before holding the parent's db_rwlock. The lock
|
||||
* order is imposed by dbuf_read's steps of "grab the lock to protect
|
||||
* db_parent, get db_parent, hold db_parent's db_rwlock".
|
||||
*/
|
||||
dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
|
||||
ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
|
||||
for (i = 0; i < nblkptr; i++) {
|
||||
children[i] =
|
||||
dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
|
||||
}
|
||||
|
||||
/* transfer dnode's block pointers to new indirect block */
|
||||
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
|
||||
if (dn->dn_dbuf != NULL)
|
||||
rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
|
||||
rw_enter(&db->db_rwlock, RW_WRITER);
|
||||
ASSERT(db->db.db_data);
|
||||
ASSERT(arc_released(db->db_buf));
|
||||
ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
|
||||
@@ -72,8 +87,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
|
||||
|
||||
/* set dbuf's parent pointers to new indirect buf */
|
||||
for (i = 0; i < nblkptr; i++) {
|
||||
dmu_buf_impl_t *child =
|
||||
dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
|
||||
dmu_buf_impl_t *child = children[i];
|
||||
|
||||
if (child == NULL)
|
||||
continue;
|
||||
@@ -106,6 +120,10 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
|
||||
|
||||
bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
|
||||
|
||||
rw_exit(&db->db_rwlock);
|
||||
if (dn->dn_dbuf != NULL)
|
||||
rw_exit(&dn->dn_dbuf->db_rwlock);
|
||||
|
||||
dbuf_rele(db, FTAG);
|
||||
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
@@ -182,7 +200,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
|
||||
ASSERT(db->db_level == 1);
|
||||
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
||||
err = dbuf_hold_impl(dn, db->db_level-1,
|
||||
err = dbuf_hold_impl(dn, db->db_level - 1,
|
||||
(db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
if (err == ENOENT)
|
||||
@@ -280,7 +298,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
|
||||
* ancestor of the first or last block to be freed. The first and
|
||||
* last L1 indirect blocks are always dirtied by dnode_free_range().
|
||||
*/
|
||||
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
||||
VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
|
||||
dmu_buf_unlock_parent(db, dblt, FTAG);
|
||||
|
||||
dbuf_release_bp(db);
|
||||
bp = db->db.db_data;
|
||||
@@ -306,7 +326,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
|
||||
|
||||
if (db->db_level == 1) {
|
||||
FREE_VERIFY(db, start, end, tx);
|
||||
free_blocks(dn, bp, end-start+1, tx);
|
||||
rw_enter(&db->db_rwlock, RW_WRITER);
|
||||
free_blocks(dn, bp, end - start + 1, tx);
|
||||
rw_exit(&db->db_rwlock);
|
||||
} else {
|
||||
for (uint64_t id = start; id <= end; id++, bp++) {
|
||||
if (BP_IS_HOLE(bp))
|
||||
@@ -323,10 +345,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
|
||||
}
|
||||
|
||||
if (free_indirects) {
|
||||
rw_enter(&db->db_rwlock, RW_WRITER);
|
||||
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
|
||||
ASSERT(BP_IS_HOLE(bp));
|
||||
bzero(db->db.db_data, db->db.db_size);
|
||||
free_blocks(dn, db->db_blkptr, 1, tx);
|
||||
rw_exit(&db->db_rwlock);
|
||||
}
|
||||
|
||||
DB_DNODE_EXIT(db);
|
||||
@@ -378,7 +402,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
|
||||
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
|
||||
TRUE, FALSE, FTAG, &db));
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
|
||||
free_children(db, blkid, nblks, free_indirects, tx);
|
||||
dbuf_rele(db, FTAG);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user