mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 11:18:52 +03:00
Improved dnode allocation and dnode_hold_impl()
Refactor dmu_object_alloc_dnsize() and dnode_hold_impl() to simplify the
code, fix errors introduced by commit dbeb879 (PR #6117) interacting
badly with large dnodes, and improve performance.
* When allocating a new dnode in dmu_object_alloc_dnsize(), update the
percpu object ID for the core's metadnode chunk immediately. This
eliminates most lock contention when taking the hold and creating the
dnode.
* Correct detection of the chunk boundary to work properly with large
dnodes.
* Separate the dnode_hold_impl() code for the FREE case from the code for
the ALLOCATED case to make it easier to read.
* Fully populate the dnode handle array immediately after reading a
block of the metadnode from disk. Subsequently the dnode handle array
provides enough information to determine which dnode slots are in use
and which are free.
* Add several kstats to allow the behavior of the code to be examined.
* Verify dnode packing in large_dnode_008_pos.ksh. Since the test performs
only creates, it should leave very few holes in the metadnode.
* Add test large_dnode_009_pos.ksh, which performs concurrent creates
and deletes, to complement the existing test, which does only creates.
With the above fixes, there is very little contention in a test of about
200,000 racing dnode allocations produced by tests 'large_dnode_008_pos'
and 'large_dnode_009_pos'.
name type data
dnode_hold_dbuf_hold 4 0
dnode_hold_dbuf_read 4 0
dnode_hold_alloc_hits 4 3804690
dnode_hold_alloc_misses 4 216
dnode_hold_alloc_interior 4 3
dnode_hold_alloc_lock_retry 4 0
dnode_hold_alloc_lock_misses 4 0
dnode_hold_alloc_type_none 4 0
dnode_hold_free_hits 4 203105
dnode_hold_free_misses 4 4
dnode_hold_free_lock_misses 4 0
dnode_hold_free_lock_retry 4 0
dnode_hold_free_overflow 4 0
dnode_hold_free_refcount 4 57
dnode_hold_free_txg 4 0
dnode_allocate 4 203154
dnode_reallocate 4 0
dnode_buf_evict 4 23918
dnode_alloc_next_chunk 4 4887
dnode_alloc_race 4 0
dnode_alloc_next_block 4 18
The performance is slightly improved for concurrent creates with
16+ threads, and unchanged for low thread counts.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #5396
Closes #6522
Closes #6414
Closes #6564
This commit is contained in:
committed by
Brian Behlendorf
parent
65dcb0f67a
commit
4c5b89f59e
+45
-23
@@ -93,7 +93,10 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
||||
* If we finished a chunk of dnodes, get a new one from
|
||||
* the global allocator.
|
||||
*/
|
||||
if (P2PHASE(object, dnodes_per_chunk) == 0) {
|
||||
if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
|
||||
(P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
|
||||
dn_slots)) {
|
||||
DNODE_STAT_BUMP(dnode_alloc_next_chunk);
|
||||
mutex_enter(&os->os_obj_lock);
|
||||
ASSERT0(P2PHASE(os->os_obj_next_chunk,
|
||||
dnodes_per_chunk));
|
||||
@@ -157,6 +160,13 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
||||
mutex_exit(&os->os_obj_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* The value of (*cpuobj) before adding dn_slots is the object
|
||||
* ID assigned to us. The value afterwards is the object ID
|
||||
* assigned to whoever wants to do an allocation next.
|
||||
*/
|
||||
object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
|
||||
|
||||
/*
|
||||
* XXX We should check for an i/o error here and return
|
||||
* up to our caller. Actually we should pre-read it in
|
||||
@@ -177,21 +187,20 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dmu_tx_add_new_object(tx, dn);
|
||||
dnode_rele(dn, FTAG);
|
||||
|
||||
(void) atomic_swap_64(cpuobj,
|
||||
object + dn_slots);
|
||||
return (object);
|
||||
}
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dnode_rele(dn, FTAG);
|
||||
DNODE_STAT_BUMP(dnode_alloc_race);
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip to next known valid starting point on error. This
|
||||
* is the start of the next block of dnodes.
|
||||
*/
|
||||
if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
|
||||
/*
|
||||
* Skip to next known valid starting point for a
|
||||
* dnode.
|
||||
*/
|
||||
object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
|
||||
DNODE_STAT_BUMP(dnode_alloc_next_block);
|
||||
}
|
||||
(void) atomic_swap_64(cpuobj, object);
|
||||
}
|
||||
@@ -304,24 +313,37 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
|
||||
if (*objectp == 0) {
|
||||
start_obj = 1;
|
||||
} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
|
||||
uint64_t i = *objectp + 1;
|
||||
uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
|
||||
dmu_object_info_t doi;
|
||||
|
||||
/*
|
||||
* For large_dnode datasets, scan from the beginning of the
|
||||
* dnode block to find the starting offset. This is needed
|
||||
* because objectp could be part of a large dnode so we can't
|
||||
* assume it's a hole even if dmu_object_info() returns ENOENT.
|
||||
* Scan through the remaining meta dnode block. The contents
|
||||
* of each slot in the block are known so it can be quickly
|
||||
* checked. If the block is exhausted without a match then
|
||||
* hand off to dnode_next_offset() for further scanning.
|
||||
*/
|
||||
int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT;
|
||||
int skip;
|
||||
uint64_t i;
|
||||
|
||||
for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) {
|
||||
dmu_object_info_t doi;
|
||||
|
||||
while (i <= last_obj) {
|
||||
error = dmu_object_info(os, i, &doi);
|
||||
if (error != 0)
|
||||
skip = 1;
|
||||
else
|
||||
skip = doi.doi_dnodesize >> DNODE_SHIFT;
|
||||
if (error == ENOENT) {
|
||||
if (hole) {
|
||||
*objectp = i;
|
||||
return (0);
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
} else if (error == EEXIST) {
|
||||
i++;
|
||||
} else if (error == 0) {
|
||||
if (hole) {
|
||||
i += doi.doi_dnodesize >> DNODE_SHIFT;
|
||||
} else {
|
||||
*objectp = i;
|
||||
return (0);
|
||||
}
|
||||
} else {
|
||||
return (error);
|
||||
}
|
||||
}
|
||||
|
||||
start_obj = i;
|
||||
|
||||
Reference in New Issue
Block a user