Improved dnode allocation and dnode_hold_impl()

Refactor dmu_object_alloc_dnsize() and dnode_hold_impl() to simplify the code, fix errors introduced by commit dbeb879 (PR #6117) interacting badly with large dnodes, and improve performance.

* When allocating a new dnode in dmu_object_alloc_dnsize(), update the percpu object ID for the core's metadnode chunk immediately. This eliminates most lock contention when taking the hold and creating the dnode.
* Correct detection of the chunk boundary to work properly with large dnodes.
* Separate the dnode_hold_impl() code for the FREE case from the code for the ALLOCATED case to make it easier to read.
* Fully populate the dnode handle array immediately after reading a block of the metadnode from disk. Subsequently the dnode handle array provides enough information to determine which dnode slots are in use and which are free.
* Add several kstats to allow the behavior of the code to be examined.
* Verify dnode packing in large_dnode_008_pos.ksh. Since the test performs only creates, it should leave very few holes in the metadnode.
* Add test large_dnode_009_pos.ksh, which performs concurrent creates and deletes, to complement the existing test, which does only creates.

With the above fixes, there is very little contention in a test of about 200,000 racing dnode allocations produced by tests 'large_dnode_008_pos' and 'large_dnode_009_pos'.
name                            type data
dnode_hold_dbuf_hold            4    0
dnode_hold_dbuf_read            4    0
dnode_hold_alloc_hits           4    3804690
dnode_hold_alloc_misses         4    216
dnode_hold_alloc_interior       4    3
dnode_hold_alloc_lock_retry     4    0
dnode_hold_alloc_lock_misses    4    0
dnode_hold_alloc_type_none      4    0
dnode_hold_free_hits            4    203105
dnode_hold_free_misses          4    4
dnode_hold_free_lock_misses     4    0
dnode_hold_free_lock_retry      4    0
dnode_hold_free_overflow        4    0
dnode_hold_free_refcount        4    57
dnode_hold_free_txg             4    0
dnode_allocate                  4    203154
dnode_reallocate                4    0
dnode_buf_evict                 4    23918
dnode_alloc_next_chunk          4    4887
dnode_alloc_race                4    0
dnode_alloc_next_block          4    18

The performance is slightly improved for concurrent creates with 16+ threads, and unchanged for low thread counts.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #5396
Closes #6522
Closes #6414
Closes #6564
2026-05-24 11:18:52 +03:00 · 2017-09-05 16:15:04 -07:00
parent 65dcb0f67a
commit 4c5b89f59e
9 changed files with 616 additions and 258 deletions
@@ -93,7 +93,10 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
 		 * If we finished a chunk of dnodes, get a new one from
 		 * the global allocator.
 		 */
-		if (P2PHASE(object, dnodes_per_chunk) == 0) {
+		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+		    dn_slots)) {
+			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
 			mutex_enter(&os->os_obj_lock);
 			ASSERT0(P2PHASE(os->os_obj_next_chunk,
 			    dnodes_per_chunk));
@@ -157,6 +160,13 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
 			mutex_exit(&os->os_obj_lock);
 		}

+		/*
+		 * The value of (*cpuobj) before adding dn_slots is the object
+		 * ID assigned to us.  The value afterwards is the object ID
+		 * assigned to whoever wants to do an allocation next.
+		 */
+		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+
 		/*
 		 * XXX We should check for an i/o error here and return
 		 * up to our caller.  Actually we should pre-read it in
@@ -177,21 +187,20 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
 				rw_exit(&dn->dn_struct_rwlock);
 				dmu_tx_add_new_object(tx, dn);
 				dnode_rele(dn, FTAG);
-
-				(void) atomic_swap_64(cpuobj,
-				    object + dn_slots);
 				return (object);
 			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dnode_rele(dn, FTAG);
+			DNODE_STAT_BUMP(dnode_alloc_race);
 		}

+		/*
+		 * Skip to next known valid starting point on error.  This
+		 * is the start of the next block of dnodes.
+		 */
 		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
-			/*
-			 * Skip to next known valid starting point for a
-			 * dnode.
-			 */
 			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+			DNODE_STAT_BUMP(dnode_alloc_next_block);
 		}
 		(void) atomic_swap_64(cpuobj, object);
 	}
@@ -304,24 +313,37 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 	if (*objectp == 0) {
 		start_obj = 1;
 	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+		uint64_t i = *objectp + 1;
+		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+		dmu_object_info_t doi;
+
 		/*
-		 * For large_dnode datasets, scan from the beginning of the
-		 * dnode block to find the starting offset. This is needed
-		 * because objectp could be part of a large dnode so we can't
-		 * assume it's a hole even if dmu_object_info() returns ENOENT.
+		 * Scan through the remaining meta dnode block.  The contents
+		 * of each slot in the block are known so it can be quickly
+		 * checked.  If the block is exhausted without a match then
+		 * hand off to dnode_next_offset() for further scanning.
 		 */
-		int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT;
-		int skip;
-		uint64_t i;
-
-		for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) {
-			dmu_object_info_t doi;
-
+		while (i <= last_obj) {
 			error = dmu_object_info(os, i, &doi);
-			if (error != 0)
-				skip = 1;
-			else
-				skip = doi.doi_dnodesize >> DNODE_SHIFT;
+			if (error == ENOENT) {
+				if (hole) {
+					*objectp = i;
+					return (0);
+				} else {
+					i++;
+				}
+			} else if (error == EEXIST) {
+				i++;
+			} else if (error == 0) {
+				if (hole) {
+					i += doi.doi_dnodesize >> DNODE_SHIFT;
+				} else {
+					*objectp = i;
+					return (0);
+				}
+			} else {
+				return (error);
+			}
 		}

 		start_obj = i;