mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-26 03:09:34 +03:00
Improved dnode allocation and dmu_hold_impl() (#6611)
Refactor dmu_object_alloc_dnsize() and dnode_hold_impl() to simplify the
code, fix errors introduced by commit dbeb879
(PR #6117) interacting
badly with large dnodes, and improve performance.
* When allocating a new dnode in dmu_object_alloc_dnsize(), update the
percpu object ID for the core's metadnode chunk immediately. This
eliminates most lock contention when taking the hold and creating the
dnode.
* Correct detection of the chunk boundary to work properly with large
dnodes.
* Separate the dnode_hold_impl() code for the FREE case from the code for
the ALLOCATED case to make it easier to read.
* Fully populate the dnode handle array immediately after reading a
block of the metadnode from disk. Subsequently the dnode handle array
provides enough information to determine which dnode slots are in use
and which are free.
* Add several kstats to allow the behavior of the code to be examined.
* Verify dnode packing in large_dnode_008_pos.ksh. Since the test
performs only creates, it should leave very few holes in the metadnode.
* Add test large_dnode_009_pos.ksh, which performs concurrent creates
and deletes, to complement existing test which does only creates.
With the above fixes, there is very little contention in a test of about
200,000 racing dnode allocations produced by tests 'large_dnode_008_pos'
and 'large_dnode_009_pos'.
name type data
dnode_hold_dbuf_hold 4 0
dnode_hold_dbuf_read 4 0
dnode_hold_alloc_hits 4 3804690
dnode_hold_alloc_misses 4 216
dnode_hold_alloc_interior 4 3
dnode_hold_alloc_lock_retry 4 0
dnode_hold_alloc_lock_misses 4 0
dnode_hold_alloc_type_none 4 0
dnode_hold_free_hits 4 203105
dnode_hold_free_misses 4 4
dnode_hold_free_lock_misses 4 0
dnode_hold_free_lock_retry 4 0
dnode_hold_free_overflow 4 0
dnode_hold_free_refcount 4 57
dnode_hold_free_txg 4 0
dnode_allocate 4 203154
dnode_reallocate 4 0
dnode_buf_evict 4 23918
dnode_alloc_next_chunk 4 4887
dnode_alloc_race 4 0
dnode_alloc_next_block 4 18
The performance is slightly improved for concurrent creates with
16+ threads, and unchanged for low thread counts.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
This commit is contained in:
parent
89950722c6
commit
45d1abc74d
@ -1933,7 +1933,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
|
||||
};
|
||||
|
||||
static void
|
||||
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
|
||||
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
|
||||
uint64_t *dnode_slots_used)
|
||||
{
|
||||
dmu_buf_t *db = NULL;
|
||||
dmu_object_info_t doi;
|
||||
@ -1965,6 +1966,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
|
||||
}
|
||||
dmu_object_info_from_dnode(dn, &doi);
|
||||
|
||||
if (dnode_slots_used)
|
||||
*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
|
||||
|
||||
zdb_nicenum(doi.doi_metadata_block_size, iblk);
|
||||
zdb_nicenum(doi.doi_data_block_size, dblk);
|
||||
zdb_nicenum(doi.doi_max_offset, lsize);
|
||||
@ -2072,6 +2076,9 @@ dump_dir(objset_t *os)
|
||||
int verbosity = dump_opt['d'];
|
||||
int print_header = 1;
|
||||
int i, error;
|
||||
uint64_t total_slots_used = 0;
|
||||
uint64_t max_slot_used = 0;
|
||||
uint64_t dnode_slots;
|
||||
|
||||
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
|
||||
dmu_objset_fast_stat(os, &dds);
|
||||
@ -2112,7 +2119,7 @@ dump_dir(objset_t *os)
|
||||
if (zopt_objects != 0) {
|
||||
for (i = 0; i < zopt_objects; i++)
|
||||
dump_object(os, zopt_object[i], verbosity,
|
||||
&print_header);
|
||||
&print_header, NULL);
|
||||
(void) printf("\n");
|
||||
return;
|
||||
}
|
||||
@ -2129,24 +2136,39 @@ dump_dir(objset_t *os)
|
||||
if (BP_IS_HOLE(os->os_rootbp))
|
||||
return;
|
||||
|
||||
dump_object(os, 0, verbosity, &print_header);
|
||||
dump_object(os, 0, verbosity, &print_header, NULL);
|
||||
object_count = 0;
|
||||
if (DMU_USERUSED_DNODE(os) != NULL &&
|
||||
DMU_USERUSED_DNODE(os)->dn_type != 0) {
|
||||
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
|
||||
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
|
||||
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
|
||||
NULL);
|
||||
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
|
||||
NULL);
|
||||
}
|
||||
|
||||
object = 0;
|
||||
while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
|
||||
dump_object(os, object, verbosity, &print_header);
|
||||
dump_object(os, object, verbosity, &print_header, &dnode_slots);
|
||||
object_count++;
|
||||
total_slots_used += dnode_slots;
|
||||
max_slot_used = object + dnode_slots - 1;
|
||||
}
|
||||
|
||||
ASSERT3U(object_count, ==, usedobjs);
|
||||
|
||||
(void) printf("\n");
|
||||
|
||||
(void) printf(" Dnode slots:\n");
|
||||
(void) printf("\tTotal used: %10llu\n",
|
||||
(u_longlong_t)total_slots_used);
|
||||
(void) printf("\tMax used: %10llu\n",
|
||||
(u_longlong_t)max_slot_used);
|
||||
(void) printf("\tPercent empty: %10lf\n",
|
||||
(double)(max_slot_used - total_slots_used)*100 /
|
||||
(double)max_slot_used);
|
||||
|
||||
(void) printf("\n");
|
||||
|
||||
if (error != ESRCH) {
|
||||
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
|
||||
abort();
|
||||
@ -2610,7 +2632,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
|
||||
return (dump_path_impl(os, child_obj, s + 1));
|
||||
/*FALLTHROUGH*/
|
||||
case DMU_OT_PLAIN_FILE_CONTENTS:
|
||||
dump_object(os, child_obj, dump_opt['v'], &header);
|
||||
dump_object(os, child_obj, dump_opt['v'], &header, NULL);
|
||||
return (0);
|
||||
default:
|
||||
(void) fprintf(stderr, "object %llu has non-file/directory "
|
||||
|
@ -100,6 +100,13 @@ extern "C" {
|
||||
#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
|
||||
#define DN_KILL_SPILLBLK (1)
|
||||
|
||||
/*
 * Sentinel values stored in a dnode handle's dnh_dnode field in place of a
 * real dnode_t pointer.  The sentinels occupy the small integer values 0-3,
 * so anything that compares greater than DN_SLOT_INTERIOR is treated as a
 * real pointer.
 *
 * The predicate macros parenthesize their argument so that they evaluate
 * correctly when handed an arbitrary expression (e.g. a conditional), not
 * just a plain identifier.
 */
#define	DN_SLOT_UNINIT		((void *)NULL)	/* Uninitialized */
#define	DN_SLOT_FREE		((void *)1UL)	/* Free slot */
#define	DN_SLOT_ALLOCATED	((void *)2UL)	/* Allocated slot */
#define	DN_SLOT_INTERIOR	((void *)3UL)	/* Interior allocated slot */
#define	DN_SLOT_IS_PTR(dn)	((void *)(dn) > DN_SLOT_INTERIOR)
#define	DN_SLOT_IS_VALID(dn)	((void *)(dn) != NULL)
|
||||
|
||||
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
|
||||
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
|
||||
|
||||
@ -363,6 +370,135 @@ void dnode_evict_bonus(dnode_t *dn);
|
||||
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
|
||||
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
|
||||
|
||||
/*
|
||||
* Used for dnodestats kstat.
|
||||
*/
|
||||
typedef struct dnode_stats {
|
||||
/*
|
||||
* Number of failed attempts to hold a meta dnode dbuf.
|
||||
*/
|
||||
kstat_named_t dnode_hold_dbuf_hold;
|
||||
/*
|
||||
* Number of failed attempts to read a meta dnode dbuf.
|
||||
*/
|
||||
kstat_named_t dnode_hold_dbuf_read;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
|
||||
* to hold the requested object number which was allocated. This is
|
||||
* the common case when looking up any allocated object number.
|
||||
*/
|
||||
kstat_named_t dnode_hold_alloc_hits;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
|
||||
* able to hold the requested object number because it was not allocated.
|
||||
*/
|
||||
kstat_named_t dnode_hold_alloc_misses;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
|
||||
* able to hold the requested object number because the object number
|
||||
* refers to an interior large dnode slot.
|
||||
*/
|
||||
kstat_named_t dnode_hold_alloc_interior;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
|
||||
* to retry acquiring slot zrl locks due to contention.
|
||||
*/
|
||||
kstat_named_t dnode_hold_alloc_lock_retry;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
|
||||
* need to create the dnode because another thread did so after
|
||||
* dropping the read lock but before acquiring the write lock.
|
||||
*/
|
||||
kstat_named_t dnode_hold_alloc_lock_misses;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
|
||||
* a free dnode instantiated by dnode_create() but not yet allocated
|
||||
* by dnode_allocate().
|
||||
*/
|
||||
kstat_named_t dnode_hold_alloc_type_none;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
|
||||
* to hold the requested range of free dnode slots.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_hits;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
|
||||
* able to hold the requested range of free dnode slots because
|
||||
* at least one slot was allocated.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_misses;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
|
||||
* able to hold the requested range of free dnode slots because
|
||||
* after acquiring the zrl lock at least one slot was allocated.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_lock_misses;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
|
||||
* to retry acquiring slot zrl locks due to contention.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_lock_retry;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
|
||||
* a range of dnode slots which were held by another thread.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_refcount;
|
||||
/*
|
||||
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
|
||||
* a range of dnode slots which would overflow the dnode_phys_t.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_overflow;
|
||||
/*
|
||||
* Number of times a dnode_hold(...) was attempted on a dnode
|
||||
* which had already been unlinked in an earlier txg.
|
||||
*/
|
||||
kstat_named_t dnode_hold_free_txg;
|
||||
/*
|
||||
* Number of new dnodes allocated by dnode_allocate().
|
||||
*/
|
||||
kstat_named_t dnode_allocate;
|
||||
/*
|
||||
* Number of dnodes re-allocated by dnode_reallocate().
|
||||
*/
|
||||
kstat_named_t dnode_reallocate;
|
||||
/*
|
||||
* Number of meta dnode dbufs evicted.
|
||||
*/
|
||||
kstat_named_t dnode_buf_evict;
|
||||
/*
|
||||
* Number of times dmu_object_alloc*() reached the end of the existing
|
||||
* object ID chunk and advanced to a new one.
|
||||
*/
|
||||
kstat_named_t dnode_alloc_next_chunk;
|
||||
/*
|
||||
* Number of times multiple threads attempted to allocate a dnode
|
||||
* from the same block of free dnodes.
|
||||
*/
|
||||
kstat_named_t dnode_alloc_race;
|
||||
/*
|
||||
* Number of times dmu_object_alloc*() was forced to advance to the
|
||||
* next meta dnode dbuf due to an error from dmu_object_next().
|
||||
*/
|
||||
kstat_named_t dnode_alloc_next_block;
|
||||
/*
|
||||
* Statistics for tracking dnodes which have been moved.
|
||||
*/
|
||||
kstat_named_t dnode_move_invalid;
|
||||
kstat_named_t dnode_move_recheck1;
|
||||
kstat_named_t dnode_move_recheck2;
|
||||
kstat_named_t dnode_move_special;
|
||||
kstat_named_t dnode_move_handle;
|
||||
kstat_named_t dnode_move_rwlock;
|
||||
kstat_named_t dnode_move_active;
|
||||
} dnode_stats_t;
|
||||
|
||||
extern dnode_stats_t dnode_stats;
|
||||
|
||||
/*
 * Helpers for updating the counters in the global dnode_stats structure.
 *
 * DNODE_STAT_INCR(stat, val) atomically adds val to the 64-bit value of the
 * named counter; DNODE_STAT_BUMP(stat) adds one.
 *
 * Both expand to a single expression with no trailing semicolon, so the
 * caller supplies the terminating ';'.  This keeps the macros usable as the
 * unbraced body of an if/else, where a semicolon baked into the macro would
 * produce an extra empty statement and detach the else.
 */
#define	DNODE_STAT_INCR(stat, val) \
	atomic_add_64(&dnode_stats.stat.value.ui64, (val))
#define	DNODE_STAT_BUMP(stat) \
	DNODE_STAT_INCR(stat, 1)
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
|
||||
/*
|
||||
|
@ -72,8 +72,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
|
||||
if (db->db_buf)
|
||||
arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
|
||||
|
||||
if (dn)
|
||||
__dmu_object_info_from_dnode(dn, &doi);
|
||||
__dmu_object_info_from_dnode(dn, &doi);
|
||||
|
||||
nwritten = snprintf(buf, size,
|
||||
"%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
|
||||
|
@ -93,7 +93,10 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
||||
* If we finished a chunk of dnodes, get a new one from
|
||||
* the global allocator.
|
||||
*/
|
||||
if (P2PHASE(object, dnodes_per_chunk) == 0) {
|
||||
if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
|
||||
(P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
|
||||
dn_slots)) {
|
||||
DNODE_STAT_BUMP(dnode_alloc_next_chunk);
|
||||
mutex_enter(&os->os_obj_lock);
|
||||
ASSERT0(P2PHASE(os->os_obj_next_chunk,
|
||||
dnodes_per_chunk));
|
||||
@ -157,6 +160,13 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
||||
mutex_exit(&os->os_obj_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* The value of (*cpuobj) before adding dn_slots is the object
|
||||
* ID assigned to us. The value afterwards is the object ID
|
||||
* assigned to whoever wants to do an allocation next.
|
||||
*/
|
||||
object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
|
||||
|
||||
/*
|
||||
* XXX We should check for an i/o error here and return
|
||||
* up to our caller. Actually we should pre-read it in
|
||||
@ -177,21 +187,20 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dmu_tx_add_new_object(tx, dn);
|
||||
dnode_rele(dn, FTAG);
|
||||
|
||||
(void) atomic_swap_64(cpuobj,
|
||||
object + dn_slots);
|
||||
return (object);
|
||||
}
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dnode_rele(dn, FTAG);
|
||||
DNODE_STAT_BUMP(dnode_alloc_race);
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip to next known valid starting point on error. This
|
||||
* is the start of the next block of dnodes.
|
||||
*/
|
||||
if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
|
||||
/*
|
||||
* Skip to next known valid starting point for a
|
||||
* dnode.
|
||||
*/
|
||||
object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
|
||||
DNODE_STAT_BUMP(dnode_alloc_next_block);
|
||||
}
|
||||
(void) atomic_swap_64(cpuobj, object);
|
||||
}
|
||||
@ -304,24 +313,37 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
|
||||
if (*objectp == 0) {
|
||||
start_obj = 1;
|
||||
} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
|
||||
uint64_t i = *objectp + 1;
|
||||
uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
|
||||
dmu_object_info_t doi;
|
||||
|
||||
/*
|
||||
* For large_dnode datasets, scan from the beginning of the
|
||||
* dnode block to find the starting offset. This is needed
|
||||
* because objectp could be part of a large dnode so we can't
|
||||
* assume it's a hole even if dmu_object_info() returns ENOENT.
|
||||
* Scan through the remaining meta dnode block. The contents
|
||||
* of each slot in the block are known so it can be quickly
|
||||
* checked. If the block is exhausted without a match then
|
||||
* hand off to dnode_next_offset() for further scanning.
|
||||
*/
|
||||
int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT;
|
||||
int skip;
|
||||
uint64_t i;
|
||||
|
||||
for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) {
|
||||
dmu_object_info_t doi;
|
||||
|
||||
while (i <= last_obj) {
|
||||
error = dmu_object_info(os, i, &doi);
|
||||
if (error)
|
||||
skip = 1;
|
||||
else
|
||||
skip = doi.doi_dnodesize >> DNODE_SHIFT;
|
||||
if (error == ENOENT) {
|
||||
if (hole) {
|
||||
*objectp = i;
|
||||
return (0);
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
} else if (error == EEXIST) {
|
||||
i++;
|
||||
} else if (error == 0) {
|
||||
if (hole) {
|
||||
i += doi.doi_dnodesize >> DNODE_SHIFT;
|
||||
} else {
|
||||
*objectp = i;
|
||||
return (0);
|
||||
}
|
||||
} else {
|
||||
return (error);
|
||||
}
|
||||
}
|
||||
|
||||
start_obj = i;
|
||||
|
@ -39,20 +39,39 @@
|
||||
#include <sys/range_tree.h>
|
||||
#include <sys/trace_dnode.h>
|
||||
|
||||
static kmem_cache_t *dnode_cache;
|
||||
/*
|
||||
* Define DNODE_STATS to turn on statistic gathering. By default, it is only
|
||||
* turned on when DEBUG is also defined.
|
||||
*/
|
||||
#ifdef DEBUG
|
||||
#define DNODE_STATS
|
||||
#endif /* DEBUG */
|
||||
dnode_stats_t dnode_stats = {
|
||||
{ "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_allocate", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_buf_evict", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_alloc_race", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_move_invalid", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_move_recheck1", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_move_recheck2", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_move_special", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_move_handle", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_move_rwlock", KSTAT_DATA_UINT64 },
|
||||
{ "dnode_move_active", KSTAT_DATA_UINT64 },
|
||||
};
|
||||
|
||||
#ifdef DNODE_STATS
|
||||
#define DNODE_STAT_ADD(stat) ((stat)++)
|
||||
#else
|
||||
#define DNODE_STAT_ADD(stat) /* nothing */
|
||||
#endif /* DNODE_STATS */
|
||||
static kstat_t *dnode_ksp;
|
||||
static kmem_cache_t *dnode_cache;
|
||||
|
||||
ASSERTV(static dnode_phys_t dnode_phys_zero);
|
||||
|
||||
@ -203,11 +222,24 @@ dnode_init(void)
|
||||
dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
|
||||
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
|
||||
kmem_cache_set_move(dnode_cache, dnode_move);
|
||||
|
||||
dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
|
||||
KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
|
||||
KSTAT_FLAG_VIRTUAL);
|
||||
if (dnode_ksp != NULL) {
|
||||
dnode_ksp->ks_data = &dnode_stats;
|
||||
kstat_install(dnode_ksp);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dnode_fini(void)
|
||||
{
|
||||
if (dnode_ksp != NULL) {
|
||||
kstat_delete(dnode_ksp);
|
||||
dnode_ksp = NULL;
|
||||
}
|
||||
|
||||
kmem_cache_destroy(dnode_cache);
|
||||
dnode_cache = NULL;
|
||||
}
|
||||
@ -391,7 +423,7 @@ dnode_setdblksz(dnode_t *dn, int size)
|
||||
}
|
||||
|
||||
static dnode_t *
|
||||
dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
|
||||
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
|
||||
uint64_t object, dnode_handle_t *dnh)
|
||||
{
|
||||
dnode_t *dn;
|
||||
@ -424,26 +456,18 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
|
||||
dn->dn_compress = dnp->dn_compress;
|
||||
dn->dn_bonustype = dnp->dn_bonustype;
|
||||
dn->dn_bonuslen = dnp->dn_bonuslen;
|
||||
dn->dn_num_slots = dnp->dn_extra_slots + 1;
|
||||
dn->dn_maxblkid = dnp->dn_maxblkid;
|
||||
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
|
||||
dn->dn_id_flags = 0;
|
||||
|
||||
if (slots && dn->dn_type == DMU_OT_NONE)
|
||||
dn->dn_num_slots = slots;
|
||||
else
|
||||
dn->dn_num_slots = dnp->dn_extra_slots + 1;
|
||||
|
||||
dmu_zfetch_init(&dn->dn_zfetch, dn);
|
||||
|
||||
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
|
||||
ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
|
||||
ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
|
||||
|
||||
mutex_enter(&os->os_lock);
|
||||
if (dnh->dnh_dnode != NULL) {
|
||||
/* Lost the allocation race. */
|
||||
mutex_exit(&os->os_lock);
|
||||
kmem_cache_free(dnode_cache, dn);
|
||||
return (dnh->dnh_dnode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Exclude special dnodes from os_dnodes so an empty os_dnodes
|
||||
@ -466,6 +490,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
|
||||
mutex_exit(&os->os_lock);
|
||||
|
||||
arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
|
||||
|
||||
return (dn);
|
||||
}
|
||||
|
||||
@ -549,6 +574,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
|
||||
|
||||
dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
|
||||
dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
|
||||
DNODE_STAT_BUMP(dnode_allocate);
|
||||
|
||||
ASSERT(dn->dn_type == DMU_OT_NONE);
|
||||
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
|
||||
@ -636,6 +662,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
|
||||
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
|
||||
|
||||
dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
|
||||
DNODE_STAT_BUMP(dnode_reallocate);
|
||||
|
||||
/* clean up any unreferenced dbufs */
|
||||
dnode_evict_dbufs(dn);
|
||||
@ -697,18 +724,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
|
||||
}
|
||||
|
||||
#ifdef _KERNEL
|
||||
#ifdef DNODE_STATS
|
||||
static struct {
|
||||
uint64_t dms_dnode_invalid;
|
||||
uint64_t dms_dnode_recheck1;
|
||||
uint64_t dms_dnode_recheck2;
|
||||
uint64_t dms_dnode_special;
|
||||
uint64_t dms_dnode_handle;
|
||||
uint64_t dms_dnode_rwlock;
|
||||
uint64_t dms_dnode_active;
|
||||
} dnode_move_stats;
|
||||
#endif /* DNODE_STATS */
|
||||
|
||||
static void
|
||||
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
|
||||
{
|
||||
@ -866,7 +881,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
*/
|
||||
os = odn->dn_objset;
|
||||
if (!POINTER_IS_VALID(os)) {
|
||||
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
|
||||
DNODE_STAT_BUMP(dnode_move_invalid);
|
||||
return (KMEM_CBRC_DONT_KNOW);
|
||||
}
|
||||
|
||||
@ -876,7 +891,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
rw_enter(&os_lock, RW_WRITER);
|
||||
if (os != odn->dn_objset) {
|
||||
rw_exit(&os_lock);
|
||||
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
|
||||
DNODE_STAT_BUMP(dnode_move_recheck1);
|
||||
return (KMEM_CBRC_DONT_KNOW);
|
||||
}
|
||||
|
||||
@ -894,7 +909,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
if (os != odn->dn_objset) {
|
||||
mutex_exit(&os->os_lock);
|
||||
rw_exit(&os_lock);
|
||||
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
|
||||
DNODE_STAT_BUMP(dnode_move_recheck2);
|
||||
return (KMEM_CBRC_DONT_KNOW);
|
||||
}
|
||||
|
||||
@ -907,7 +922,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
rw_exit(&os_lock);
|
||||
if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
|
||||
mutex_exit(&os->os_lock);
|
||||
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
|
||||
DNODE_STAT_BUMP(dnode_move_special);
|
||||
return (KMEM_CBRC_NO);
|
||||
}
|
||||
ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
|
||||
@ -922,7 +937,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
*/
|
||||
if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
|
||||
mutex_exit(&os->os_lock);
|
||||
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
|
||||
DNODE_STAT_BUMP(dnode_move_handle);
|
||||
return (KMEM_CBRC_LATER);
|
||||
}
|
||||
|
||||
@ -938,7 +953,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
|
||||
zrl_exit(&odn->dn_handle->dnh_zrlock);
|
||||
mutex_exit(&os->os_lock);
|
||||
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
|
||||
DNODE_STAT_BUMP(dnode_move_rwlock);
|
||||
return (KMEM_CBRC_LATER);
|
||||
}
|
||||
|
||||
@ -964,7 +979,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
rw_exit(&odn->dn_struct_rwlock);
|
||||
zrl_exit(&odn->dn_handle->dnh_zrlock);
|
||||
mutex_exit(&os->os_lock);
|
||||
DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
|
||||
DNODE_STAT_BUMP(dnode_move_active);
|
||||
return (KMEM_CBRC_LATER);
|
||||
}
|
||||
|
||||
@ -988,6 +1003,78 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
|
||||
}
|
||||
#endif /* _KERNEL */
|
||||
|
||||
static void
|
||||
dnode_slots_hold(dnode_children_t *children, int idx, int slots)
|
||||
{
|
||||
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
|
||||
|
||||
for (int i = idx; i < idx + slots; i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
zrl_add(&dnh->dnh_zrlock);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
dnode_slots_rele(dnode_children_t *children, int idx, int slots)
|
||||
{
|
||||
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
|
||||
|
||||
for (int i = idx; i < idx + slots; i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
|
||||
if (zrl_is_locked(&dnh->dnh_zrlock))
|
||||
zrl_exit(&dnh->dnh_zrlock);
|
||||
else
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
|
||||
{
|
||||
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
|
||||
|
||||
for (int i = idx; i < idx + slots; i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
|
||||
if (!zrl_tryenter(&dnh->dnh_zrlock)) {
|
||||
for (int j = idx; j < i; j++) {
|
||||
dnh = &children->dnc_children[j];
|
||||
zrl_exit(&dnh->dnh_zrlock);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
}
|
||||
|
||||
return (1);
|
||||
}
|
||||
|
||||
static void
|
||||
dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
|
||||
{
|
||||
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
|
||||
|
||||
for (int i = idx; i < idx + slots; i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
dnh->dnh_dnode = ptr;
|
||||
}
|
||||
}
|
||||
|
||||
static boolean_t
|
||||
dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
|
||||
{
|
||||
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
|
||||
|
||||
for (int i = idx; i < idx + slots; i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
if (dnh->dnh_dnode != ptr)
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
void
|
||||
dnode_special_close(dnode_handle_t *dnh)
|
||||
{
|
||||
@ -995,7 +1082,7 @@ dnode_special_close(dnode_handle_t *dnh)
|
||||
|
||||
/*
|
||||
* Wait for final references to the dnode to clear. This can
|
||||
* only happen if the arc is asyncronously evicting state that
|
||||
* only happen if the arc is asynchronously evicting state that
|
||||
* has a hold on this dnode while we are trying to evict this
|
||||
* dnode.
|
||||
*/
|
||||
@ -1015,19 +1102,24 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
|
||||
{
|
||||
dnode_t *dn;
|
||||
|
||||
dn = dnode_create(os, dnp, 0, NULL, object, dnh);
|
||||
zrl_init(&dnh->dnh_zrlock);
|
||||
zrl_tryenter(&dnh->dnh_zrlock);
|
||||
|
||||
dn = dnode_create(os, dnp, NULL, object, dnh);
|
||||
DNODE_VERIFY(dn);
|
||||
|
||||
zrl_exit(&dnh->dnh_zrlock);
|
||||
}
|
||||
|
||||
static void
|
||||
dnode_buf_evict_async(void *dbu)
|
||||
{
|
||||
dnode_children_t *children_dnodes = dbu;
|
||||
int i;
|
||||
dnode_children_t *dnc = dbu;
|
||||
|
||||
for (i = 0; i < children_dnodes->dnc_count; i++) {
|
||||
dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
|
||||
DNODE_STAT_BUMP(dnode_buf_evict);
|
||||
|
||||
for (int i = 0; i < dnc->dnc_count; i++) {
|
||||
dnode_handle_t *dnh = &dnc->dnc_children[i];
|
||||
dnode_t *dn;
|
||||
|
||||
/*
|
||||
@ -1035,8 +1127,9 @@ dnode_buf_evict_async(void *dbu)
|
||||
* another valid address, so there is no need here to guard
|
||||
* against changes to or from NULL.
|
||||
*/
|
||||
if (dnh->dnh_dnode == NULL) {
|
||||
if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
|
||||
zrl_destroy(&dnh->dnh_zrlock);
|
||||
dnh->dnh_dnode = DN_SLOT_UNINIT;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1051,150 +1144,37 @@ dnode_buf_evict_async(void *dbu)
|
||||
ASSERT(refcount_is_zero(&dn->dn_holds));
|
||||
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
|
||||
|
||||
dnode_destroy(dn); /* implicit zrl_remove() */
|
||||
dnode_destroy(dn); /* implicit zrl_remove() for first slot */
|
||||
zrl_destroy(&dnh->dnh_zrlock);
|
||||
dnh->dnh_dnode = NULL;
|
||||
}
|
||||
kmem_free(children_dnodes, sizeof (dnode_children_t) +
|
||||
children_dnodes->dnc_count * sizeof (dnode_handle_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if the given index is interior to a dnode already
|
||||
* allocated in the block. That is, the index is neither free nor
|
||||
* allocated, but is consumed by a large dnode.
|
||||
*
|
||||
* The dnode_phys_t buffer may not be in sync with the in-core dnode
|
||||
* structure, so we try to check the dnode structure first and fall back
|
||||
* to the dnode_phys_t buffer it doesn't exist. When an in-code dnode
|
||||
* exists we can always trust dn->dn_num_slots to be accurate, even for
|
||||
* a held dnode which has not yet been fully allocated.
|
||||
*/
|
||||
static boolean_t
|
||||
dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
|
||||
{
|
||||
int skip, i;
|
||||
|
||||
for (i = 0; i < idx; i += skip) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
|
||||
if (dnh->dnh_dnode != NULL) {
|
||||
skip = dnh->dnh_dnode->dn_num_slots;
|
||||
} else {
|
||||
if (dn_block[i].dn_type != DMU_OT_NONE)
|
||||
skip = dn_block[i].dn_extra_slots + 1;
|
||||
else
|
||||
skip = 1;
|
||||
}
|
||||
}
|
||||
|
||||
return (i > idx);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if the given index in the dnode block is a valid
|
||||
* allocated dnode. That is, the index is not consumed by a large
|
||||
* dnode and is not free.
|
||||
*
|
||||
* The dnode_phys_t buffer may not be in sync with the in-core dnode
|
||||
* structure, so we try to check the dnode structure first and fall back
|
||||
* to the dnode_phys_t buffer it doesn't exist.
|
||||
*/
|
||||
static boolean_t
|
||||
dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
|
||||
{
|
||||
dnode_handle_t *dnh;
|
||||
dmu_object_type_t ot;
|
||||
|
||||
if (dnode_is_consumed(children, dn_block, idx))
|
||||
return (B_FALSE);
|
||||
|
||||
dnh = &children->dnc_children[idx];
|
||||
if (dnh->dnh_dnode != NULL)
|
||||
ot = dnh->dnh_dnode->dn_type;
|
||||
else
|
||||
ot = dn_block[idx].dn_type;
|
||||
|
||||
return (ot != DMU_OT_NONE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if the given range of indices in the dnode block are
|
||||
* free. That is, the starting index is not consumed by a large dnode
|
||||
* and none of the indices are allocated.
|
||||
*
|
||||
* The dnode_phys_t buffer may not be in sync with the in-core dnode
|
||||
* structure, so we try to check the dnode structure first and fall back
|
||||
* to the dnode_phys_t buffer it doesn't exist.
|
||||
*/
|
||||
static boolean_t
|
||||
dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx,
|
||||
int slots)
|
||||
{
|
||||
if (idx + slots > DNODES_PER_BLOCK)
|
||||
return (B_FALSE);
|
||||
|
||||
if (dnode_is_consumed(children, dn_block, idx))
|
||||
return (B_FALSE);
|
||||
|
||||
for (int i = idx; i < idx + slots; i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
dmu_object_type_t ot;
|
||||
|
||||
if (dnh->dnh_dnode != NULL) {
|
||||
if (dnh->dnh_dnode->dn_num_slots > 1)
|
||||
return (B_FALSE);
|
||||
|
||||
ot = dnh->dnh_dnode->dn_type;
|
||||
} else {
|
||||
ot = dn_block[i].dn_type;
|
||||
}
|
||||
|
||||
if (ot != DMU_OT_NONE)
|
||||
return (B_FALSE);
|
||||
}
|
||||
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
static void
|
||||
dnode_hold_slots(dnode_children_t *children, int idx, int slots)
|
||||
{
|
||||
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
zrl_add(&dnh->dnh_zrlock);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
dnode_rele_slots(dnode_children_t *children, int idx, int slots)
|
||||
{
|
||||
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
|
||||
dnode_handle_t *dnh = &children->dnc_children[i];
|
||||
zrl_remove(&dnh->dnh_zrlock);
|
||||
dnh->dnh_dnode = DN_SLOT_UNINIT;
|
||||
}
|
||||
kmem_free(dnc, sizeof (dnode_children_t) +
|
||||
dnc->dnc_count * sizeof (dnode_handle_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* errors:
|
||||
* EINVAL - invalid object number.
|
||||
* ENOSPC - hole too small to fulfill "slots" request
|
||||
* ENOENT - the requested dnode is not allocated
|
||||
* EIO - i/o error.
|
||||
* EINVAL - Invalid object number or flags.
|
||||
* ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
|
||||
* EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
|
||||
* - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
|
||||
* ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
|
||||
* EIO - I/O error when reading the meta dnode dbuf.
|
||||
*
|
||||
* succeeds even for free dnodes.
|
||||
*/
|
||||
int
|
||||
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
void *tag, dnode_t **dnp)
|
||||
{
|
||||
int epb, idx, err, i;
|
||||
int epb, idx, err;
|
||||
int drop_struct_lock = FALSE;
|
||||
int type;
|
||||
uint64_t blk;
|
||||
dnode_t *mdn, *dn;
|
||||
dmu_buf_impl_t *db;
|
||||
dnode_children_t *children_dnodes;
|
||||
dnode_phys_t *dn_block_begin;
|
||||
dnode_children_t *dnc;
|
||||
dnode_phys_t *dn_block;
|
||||
dnode_handle_t *dnh;
|
||||
|
||||
ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
|
||||
@ -1244,10 +1224,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
db = dbuf_hold(mdn, blk, FTAG);
|
||||
if (drop_struct_lock)
|
||||
rw_exit(&mdn->dn_struct_rwlock);
|
||||
if (db == NULL)
|
||||
if (db == NULL) {
|
||||
DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
|
||||
return (SET_ERROR(EIO));
|
||||
}
|
||||
err = dbuf_read(db, NULL, DB_RF_CANFAIL);
|
||||
if (err) {
|
||||
DNODE_STAT_BUMP(dnode_hold_dbuf_read);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (err);
|
||||
}
|
||||
@ -1255,72 +1238,179 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
|
||||
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
|
||||
epb = db->db.db_size >> DNODE_SHIFT;
|
||||
|
||||
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
|
||||
children_dnodes = dmu_buf_get_user(&db->db);
|
||||
if (children_dnodes == NULL) {
|
||||
dnode_children_t *winner;
|
||||
children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
|
||||
epb * sizeof (dnode_handle_t), KM_SLEEP);
|
||||
children_dnodes->dnc_count = epb;
|
||||
dnh = &children_dnodes->dnc_children[0];
|
||||
for (i = 0; i < epb; i++) {
|
||||
zrl_init(&dnh[i].dnh_zrlock);
|
||||
}
|
||||
dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
|
||||
dnode_buf_evict_async, NULL);
|
||||
winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
|
||||
if (winner != NULL) {
|
||||
idx = object & (epb - 1);
|
||||
dn_block = (dnode_phys_t *)db->db.db_data;
|
||||
|
||||
for (i = 0; i < epb; i++) {
|
||||
zrl_destroy(&dnh[i].dnh_zrlock);
|
||||
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
|
||||
dnc = dmu_buf_get_user(&db->db);
|
||||
dnh = NULL;
|
||||
if (dnc == NULL) {
|
||||
dnode_children_t *winner;
|
||||
int skip = 0;
|
||||
|
||||
dnc = kmem_zalloc(sizeof (dnode_children_t) +
|
||||
epb * sizeof (dnode_handle_t), KM_SLEEP);
|
||||
dnc->dnc_count = epb;
|
||||
dnh = &dnc->dnc_children[0];
|
||||
|
||||
/* Initialize dnode slot status from dnode_phys_t */
|
||||
for (int i = 0; i < epb; i++) {
|
||||
zrl_init(&dnh[i].dnh_zrlock);
|
||||
|
||||
if (skip) {
|
||||
skip--;
|
||||
continue;
|
||||
}
|
||||
|
||||
kmem_free(children_dnodes, sizeof (dnode_children_t) +
|
||||
if (dn_block[i].dn_type != DMU_OT_NONE) {
|
||||
int interior = dn_block[i].dn_extra_slots;
|
||||
|
||||
dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
|
||||
dnode_set_slots(dnc, i + 1, interior,
|
||||
DN_SLOT_INTERIOR);
|
||||
skip = interior;
|
||||
} else {
|
||||
dnh[i].dnh_dnode = DN_SLOT_FREE;
|
||||
skip = 0;
|
||||
}
|
||||
}
|
||||
|
||||
dmu_buf_init_user(&dnc->dnc_dbu, NULL,
|
||||
dnode_buf_evict_async, NULL);
|
||||
winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
|
||||
if (winner != NULL) {
|
||||
|
||||
for (int i = 0; i < epb; i++)
|
||||
zrl_destroy(&dnh[i].dnh_zrlock);
|
||||
|
||||
kmem_free(dnc, sizeof (dnode_children_t) +
|
||||
epb * sizeof (dnode_handle_t));
|
||||
children_dnodes = winner;
|
||||
dnc = winner;
|
||||
}
|
||||
}
|
||||
ASSERT(children_dnodes->dnc_count == epb);
|
||||
|
||||
idx = object & (epb - 1);
|
||||
dn_block_begin = (dnode_phys_t *)db->db.db_data;
|
||||
ASSERT(dnc->dnc_count == epb);
|
||||
dn = DN_SLOT_UNINIT;
|
||||
|
||||
dnode_hold_slots(children_dnodes, idx, slots);
|
||||
if (flag & DNODE_MUST_BE_ALLOCATED) {
|
||||
slots = 1;
|
||||
|
||||
if ((flag & DNODE_MUST_BE_FREE) &&
|
||||
!dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) {
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
while (dn == DN_SLOT_UNINIT) {
|
||||
dnode_slots_hold(dnc, idx, slots);
|
||||
dnh = &dnc->dnc_children[idx];
|
||||
|
||||
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
|
||||
dn = dnh->dnh_dnode;
|
||||
break;
|
||||
} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
|
||||
DNODE_STAT_BUMP(dnode_hold_alloc_interior);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(EEXIST));
|
||||
} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
|
||||
DNODE_STAT_BUMP(dnode_hold_alloc_misses);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
if (!dnode_slots_tryenter(dnc, idx, slots)) {
|
||||
DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Someone else won the race and called dnode_create()
|
||||
* after we checked DN_SLOT_IS_PTR() above but before
|
||||
* we acquired the lock.
|
||||
*/
|
||||
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
|
||||
DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
|
||||
dn = dnh->dnh_dnode;
|
||||
} else {
|
||||
dn = dnode_create(os, dn_block + idx, db,
|
||||
object, dnh);
|
||||
}
|
||||
}
|
||||
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
if (dn->dn_type == DMU_OT_NONE) {
|
||||
DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(ENOENT));
|
||||
}
|
||||
|
||||
DNODE_STAT_BUMP(dnode_hold_alloc_hits);
|
||||
} else if (flag & DNODE_MUST_BE_FREE) {
|
||||
|
||||
if (idx + slots - 1 >= DNODES_PER_BLOCK) {
|
||||
DNODE_STAT_BUMP(dnode_hold_free_overflow);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(ENOSPC));
|
||||
}
|
||||
|
||||
while (dn == DN_SLOT_UNINIT) {
|
||||
dnode_slots_hold(dnc, idx, slots);
|
||||
|
||||
if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
|
||||
DNODE_STAT_BUMP(dnode_hold_free_misses);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(ENOSPC));
|
||||
}
|
||||
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
if (!dnode_slots_tryenter(dnc, idx, slots)) {
|
||||
DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
|
||||
DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(ENOSPC));
|
||||
}
|
||||
|
||||
dnh = &dnc->dnc_children[idx];
|
||||
dn = dnode_create(os, dn_block + idx, db, object, dnh);
|
||||
}
|
||||
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
if (!refcount_is_zero(&dn->dn_holds)) {
|
||||
DNODE_STAT_BUMP(dnode_hold_free_refcount);
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (SET_ERROR(EEXIST));
|
||||
}
|
||||
|
||||
dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
|
||||
DNODE_STAT_BUMP(dnode_hold_free_hits);
|
||||
} else {
|
||||
dbuf_rele(db, FTAG);
|
||||
return (ENOSPC);
|
||||
} else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
|
||||
!dnode_is_allocated(children_dnodes, dn_block_begin, idx)) {
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (ENOENT);
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
dnh = &children_dnodes->dnc_children[idx];
|
||||
dn = dnh->dnh_dnode;
|
||||
if (dn == NULL)
|
||||
dn = dnode_create(os, dn_block_begin + idx, slots, db,
|
||||
object, dnh);
|
||||
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
type = dn->dn_type;
|
||||
if (dn->dn_free_txg ||
|
||||
((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
|
||||
if (dn->dn_free_txg) {
|
||||
DNODE_STAT_BUMP(dnode_hold_free_txg);
|
||||
type = dn->dn_type;
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
dbuf_rele(db, FTAG);
|
||||
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
|
||||
}
|
||||
|
||||
if (refcount_add(&dn->dn_holds, tag) == 1)
|
||||
dbuf_add_ref(db, dnh);
|
||||
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
|
||||
/* Now we can rely on the hold to prevent the dnode from moving. */
|
||||
dnode_rele_slots(children_dnodes, idx, slots);
|
||||
dnode_slots_rele(dnc, idx, slots);
|
||||
|
||||
DNODE_VERIFY(dn);
|
||||
ASSERT3P(dn->dn_dbuf, ==, db);
|
||||
|
@ -365,7 +365,7 @@ tests = ['async_destroy_001_pos']
|
||||
[tests/functional/features/large_dnode]
|
||||
tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
|
||||
'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
|
||||
'large_dnode_007_neg', 'large_dnode_008_pos']
|
||||
'large_dnode_007_neg', 'large_dnode_008_pos', 'large_dnode_009_pos']
|
||||
|
||||
[tests/functional/grow_pool]
|
||||
tests = ['grow_pool_001_pos']
|
||||
|
@ -9,4 +9,5 @@ dist_pkgdata_SCRIPTS = \
|
||||
large_dnode_005_pos.ksh \
|
||||
large_dnode_006_pos.ksh \
|
||||
large_dnode_007_neg.ksh \
|
||||
large_dnode_008_pos.ksh
|
||||
large_dnode_008_pos.ksh \
|
||||
large_dnode_009_pos.ksh
|
||||
|
@ -42,6 +42,21 @@ function cleanup
|
||||
datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
|
||||
}
|
||||
|
||||
function verify_dnode_packing
|
||||
{
|
||||
zdb -dd $TEST_FS | grep -A 3 'Dnode slots' | awk '
|
||||
/Total used:/ {total_used=$NF}
|
||||
/Max used:/ {max_used=$NF}
|
||||
/Percent empty:/ {print total_used, max_used, int($NF)}
|
||||
' | while read total_used max_used pct_empty
|
||||
do
|
||||
log_note "total_used $total_used max_used $max_used pct_empty $pct_empty"
|
||||
if [ $pct_empty -gt 5 ]; then
|
||||
log_fail "Holes in dnode array: pct empty $pct_empty > 5"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
log_assert "xattrtest runs concurrently on dataset with large dnodes"
|
||||
|
||||
@ -52,9 +67,11 @@ log_must zfs set xattr=sa $TEST_FS
|
||||
for ((i=0; i < 100; i++)); do
|
||||
dir="/$TEST_FS/dir.$i"
|
||||
log_must mkdir "$dir"
|
||||
log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &"
|
||||
log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir >/dev/null 2>&1 &"
|
||||
done
|
||||
|
||||
log_must wait
|
||||
|
||||
verify_dnode_packing
|
||||
|
||||
log_pass
|
||||
|
@ -0,0 +1,71 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or http://www.opensolaris.org/os/licensing.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
|
||||
# Use is subject to license terms.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Run many xattrtests on a dataset with large dnodes and xattr=sa to
|
||||
# stress concurrent allocation and deletion of large dnodes.
|
||||
#
|
||||
|
||||
TEST_FS=$TESTPOOL/large_dnode
|
||||
|
||||
verify_runnable "both"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
log_assert "xattrtest runs concurrently on dataset with large dnodes"
|
||||
|
||||
log_must zfs create $TEST_FS
|
||||
log_must zfs set dnsize=auto $TEST_FS
|
||||
log_must zfs set xattr=sa $TEST_FS
|
||||
|
||||
for ((i=0; i < 100; i++)); do
|
||||
dir="/$TEST_FS/dir.$i"
|
||||
log_must mkdir "$dir"
|
||||
|
||||
do_unlink=""
|
||||
if [ $((RANDOM % 2)) -eq 0 ]; then
|
||||
do_unlink="-k -f 1024"
|
||||
else
|
||||
do_unlink="-f $((RANDOM % 1024))"
|
||||
fi
|
||||
log_must eval "xattrtest -R -r -y -x 1 $do_unlink -p $dir >/dev/null 2>&1 &"
|
||||
done
|
||||
|
||||
log_must wait
|
||||
|
||||
log_must zpool export $TESTPOOL
|
||||
log_must zpool import $TESTPOOL
|
||||
log_must ls -lR "/$TEST_FS/" >/dev/null 2>&1
|
||||
log_must zdb -d $TESTPOOL
|
||||
log_pass
|
Loading…
Reference in New Issue
Block a user