OpenZFS 9337 - zfs get all is slow due to uncached metadata

This project's goal is to make read-heavy channel programs and zfs(1m)
administrative commands faster by caching all the metadata that they will
need in the dbuf layer. This will prevent the data from being evicted, so
that any future call to, e.g., zfs get all won't have to go to disk (very
much). There are two parts:

 - The dbuf_metadata_cache. We identify what to put into the cache based
   on the object type of each dbuf.

 - Caching the objset properties os_{version,normalization,utf8only,
   casesensitivity} in the objset_t. These needed to be cached explicitly
   because, although they are queried frequently, they aren't stored in a
   dbuf type that we can easily recognize and cache in the dbuf layer;
   instead, we have to store them ourselves. There's already existing
   infrastructure for maintaining cached properties in the objset setup
   code, so I simply used that.
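
To make the first part concrete: when a dbuf's last hold is dropped, the
placement decision comes down to checking the owning dnode's object type
against a new per-type flag (ot_dbuf_metadata_cache in dmu_ot[]), with a
size cap as a safety valve so small-memory systems fall back to the normal
LRU dbuf cache (see dbuf_include_in_metadata_cache() in the dbuf.c hunk
below). The following is only a condensed, standalone sketch of that flow,
not the patch itself; the object-type names and the 64 MB cap are invented
for illustration:

    /* Standalone model of the metadata-cache placement check (illustrative only). */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef enum { OT_PLAIN_FILE, OT_DSL_DATASET, OT_DSL_PROPS, OT_NUMTYPES } obj_type_t;

    /* Mirrors the new ot_dbuf_metadata_cache column added to dmu_ot[]. */
    static const bool metadata_cached[OT_NUMTYPES] = {
        [OT_PLAIN_FILE]  = false,  /* user data stays in the normal LRU dbuf cache */
        [OT_DSL_DATASET] = true,   /* dataset metadata is pinned in the metadata cache */
        [OT_DSL_PROPS]   = true,   /* property ZAPs are pinned in the metadata cache */
    };

    static uint64_t metadata_cache_size;                  /* bytes currently cached */
    static const uint64_t metadata_cache_max = 64 << 20;  /* cap, like dbuf_metadata_cache_max_bytes */

    /* Decide which cache a released buffer of the given type/size should join. */
    static bool
    include_in_metadata_cache(obj_type_t type, uint64_t size)
    {
        if (!metadata_cached[type])
            return (false);
        /* Safety valve for small-memory systems: overflow into the LRU cache. */
        if (metadata_cache_size > metadata_cache_max)
            return (false);
        metadata_cache_size += size;
        return (true);
    }

    int
    main(void)
    {
        printf("DSL dataset dbuf -> %s cache\n",
            include_in_metadata_cache(OT_DSL_DATASET, 16384) ? "metadata" : "LRU");
        printf("plain file dbuf  -> %s cache\n",
            include_in_metadata_cache(OT_PLAIN_FILE, 131072) ? "metadata" : "LRU");
        return (0);
    }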

Performance Testing:

 - Disabled kmem_flags
 - Tuned dbuf_cache_max_bytes very low (128K)
 - Tuned zfs_arc_max very low (64M)

Created a test pool with 400 filesystems and 100 snapshots per filesystem.
Later in testing, added 600 more filesystems (with no snapshots) to make
sure scaling didn't look different between snapshots and filesystems.

Results:

    | Test                   | Time (trunk / diff) | I/Os (trunk / diff) |
    +------------------------+---------------------+---------------------+
    | zpool import           |     0:05 / 0:06     |    12.9k / 12.9k    |
    | zfs get all (uncached) |     1:36 / 0:53     |    16.7k / 5.7k     |
    | zfs get all (cached)   |     1:36 / 0:51     |    16.0k / 6.0k     |

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Richard Lowe <richlowe@richlowe.net>
Ported-by: Alek Pinchuk <apinchuk@datto.com>
Signed-off-by: Alek Pinchuk <apinchuk@datto.com>

OpenZFS-issue: https://illumos.org/issues/9337
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7dec52f
Closes #7668

include/sys/dbuf.h

@@ -84,6 +84,13 @@ typedef enum dbuf_states {
 	DB_EVICTING
 } dbuf_states_t;
 
+typedef enum dbuf_cached_state {
+	DB_NO_CACHE = -1,
+	DB_DBUF_CACHE,
+	DB_DBUF_METADATA_CACHE,
+	DB_CACHE_MAX
+} dbuf_cached_state_t;
+
 struct dnode;
 struct dmu_tx;
@@ -240,11 +247,12 @@ typedef struct dmu_buf_impl {
 	 */
 	avl_node_t db_link;
 
-	/*
-	 * Link in dbuf_cache.
-	 */
+	/* Link in dbuf_cache or dbuf_metadata_cache */
 	multilist_node_t db_cache_link;
 
+	/* Tells us which dbuf cache this dbuf is in, if any */
+	dbuf_cached_state_t db_caching_status;
+
 	/* Data which is unique to data (leaf) blocks: */
 
 	/* User callback information. */
@@ -305,7 +313,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
     uint64_t blkid);

include/sys/dmu.h

@@ -107,7 +107,8 @@ typedef enum dmu_object_byteswap {
 /*
  * Defines a uint8_t object type. Object types specify if the data
  * in the object is metadata (boolean) and how to byteswap the data
- * (dmu_object_byteswap_t).
+ * (dmu_object_byteswap_t). All of the types created by this method
+ * are cached in the dbuf metadata cache.
  */
 #define	DMU_OT(byteswap, metadata, encrypted) \
 	(DMU_OT_NEWTYPE | \
@@ -119,6 +120,9 @@ typedef enum dmu_object_byteswap {
 	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
 	(ot) < DMU_OT_NUMTYPES)
 
+#define	DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
+
 /*
  * MDB doesn't have dmu_ot; it defines these macros itself.
  */
@@ -883,6 +887,7 @@ typedef void (*const arc_byteswap_func_t)(void *buf, size_t size);
 typedef struct dmu_object_type_info {
 	dmu_object_byteswap_t ot_byteswap;
 	boolean_t ot_metadata;
+	boolean_t ot_dbuf_metadata_cache;
 	boolean_t ot_encrypt;
 	char *ot_name;
 } dmu_object_type_info_t;

include/sys/dmu_objset.h

@@ -38,6 +38,7 @@
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sa.h>
+#include <sys/zfs_ioctl.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -90,6 +91,7 @@ typedef struct objset_phys {
 typedef int (*dmu_objset_upgrade_cb_t)(objset_t *);
 
+#define	OBJSET_PROP_UNINITIALIZED	((uint64_t)-1)
+
 struct objset {
 	/* Immutable: */
 	struct dsl_dataset *os_dsl_dataset;
@@ -125,6 +127,16 @@ struct objset {
 	zfs_sync_type_t os_sync;
 	zfs_redundant_metadata_type_t os_redundant_metadata;
 	int os_recordsize;
+	/*
+	 * The next four values are used as a cache of whatever's on disk, and
+	 * are initialized the first time these properties are queried. Before
+	 * being initialized with their real values, their values are
+	 * OBJSET_PROP_UNINITIALIZED.
+	 */
+	uint64_t os_version;
+	uint64_t os_normalization;
+	uint64_t os_utf8only;
+	uint64_t os_casesensitivity;
 
 	/*
 	 * Pointer is constant; the blkptr it points to is protected by

include/sys/dnode.h

@@ -408,7 +408,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
     void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
-void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,

include/sys/zfs_ioctl.h

@@ -488,7 +488,6 @@ extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *);
 extern int zfs_secpolicy_destroy_perms(const char *, cred_t *);
 extern void zfs_unmount_snap(const char *);
 extern void zfs_destroy_unmount_origin(const char *);
-extern boolean_t dataset_name_hidden(const char *);
 extern int getzfsvfs_impl(struct objset *, struct zfsvfs **);
 extern int getzfsvfs(const char *, struct zfsvfs **);

include/zfs_comutil.h

@@ -38,6 +38,9 @@ extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *);
 extern int zfs_zpl_version_map(int spa_version);
 extern int zfs_spa_version_map(int zpl_version);
 
+extern boolean_t zfs_dataset_name_hidden(const char *);
+
 #define	ZFS_NUM_LEGACY_HISTORY_EVENTS	41
 extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS];

man/man5/zfs-module-parameters.5

@@ -41,6 +41,21 @@ kstat.
 Default value: \fB0\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBdbuf_metadata_cache_max_bytes\fR (ulong)
+.ad
+.RS 12n
+Maximum size in bytes of the metadata dbuf cache. When \fB0\fR this value will
+default to \fB1/2^dbuf_cache_shift\fR (1/16) of the target ARC size, otherwise
+the provided value in bytes will be used. The behavior of the metadata dbuf
+cache and its associated settings can be observed via the
+\fB/proc/spl/kstat/zfs/dbufstats\fR kstat.
+.sp
+Default value: \fB0\fR.
+.RE
+
 .sp
 .ne 2
 .na
@@ -77,6 +92,18 @@ of the target arc size.
 Default value: \fB5\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBdbuf_metadata_cache_shift\fR (int)
+.ad
+.RS 12n
+Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR,
+to a log2 fraction of the target arc size.
+.sp
+Default value: \fB6\fR.
+.RE
+
 .sp
 .ne 2
 .na

module/zcommon/zfs_comutil.c

@@ -204,10 +204,28 @@ const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
 	"pool split",
 };
 
+boolean_t
+zfs_dataset_name_hidden(const char *name)
+{
+	/*
+	 * Skip over datasets that are not visible in this zone,
+	 * internal datasets (which have a $ in their name), and
+	 * temporary datasets (which have a % in their name).
+	 */
+	if (strchr(name, '$') != NULL)
+		return (B_TRUE);
+	if (strchr(name, '%') != NULL)
+		return (B_TRUE);
+	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
 #if defined(_KERNEL)
 EXPORT_SYMBOL(zfs_allocatable_devs);
 EXPORT_SYMBOL(zpool_get_load_policy);
 EXPORT_SYMBOL(zfs_zpl_version_map);
 EXPORT_SYMBOL(zfs_spa_version_map);
 EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_dataset_name_hidden);
 #endif

module/zfs/dbuf.c

@ -49,6 +49,7 @@
#include <sys/abd.h> #include <sys/abd.h>
#include <sys/vdev.h> #include <sys/vdev.h>
#include <sys/cityhash.h> #include <sys/cityhash.h>
#include <sys/spa_impl.h>
kstat_t *dbuf_ksp; kstat_t *dbuf_ksp;
@ -94,6 +95,18 @@ typedef struct dbuf_stats {
* already created and in the dbuf hash table. * already created and in the dbuf hash table.
*/ */
kstat_named_t hash_insert_race; kstat_named_t hash_insert_race;
/*
* Statistics about the size of the metadata dbuf cache.
*/
kstat_named_t metadata_cache_count;
kstat_named_t metadata_cache_size_bytes;
kstat_named_t metadata_cache_size_bytes_max;
/*
* For diagnostic purposes, this is incremented whenever we can't add
* something to the metadata cache because it's full, and instead put
* the data in the regular dbuf cache.
*/
kstat_named_t metadata_cache_overflow;
} dbuf_stats_t; } dbuf_stats_t;
dbuf_stats_t dbuf_stats = { dbuf_stats_t dbuf_stats = {
@ -113,7 +126,11 @@ dbuf_stats_t dbuf_stats = {
{ "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 },
{ "hash_insert_race", KSTAT_DATA_UINT64 } { "hash_insert_race", KSTAT_DATA_UINT64 },
{ "metadata_cache_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
{ "metadata_cache_overflow", KSTAT_DATA_UINT64 }
}; };
#define DBUF_STAT_INCR(stat, val) \ #define DBUF_STAT_INCR(stat, val) \
@@ -175,24 +192,51 @@ static kcondvar_t dbuf_evict_cv;
 static boolean_t dbuf_evict_thread_exit;
 
 /*
- * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs are added to the dbuf cache once the last hold is released. If a
- * dbuf is later accessed and still exists in the dbuf cache, then it will
- * be removed from the cache and later re-added to the head of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ *    that represent the metadata that describes filesystems/snapshots/
+ *    bookmarks/properties/etc. We only evict from this cache when we export a
+ *    pool, to short-circuit as much I/O as possible for all administrative
+ *    commands that need the metadata. There is no eviction policy for this
+ *    cache, because we try to only include types in it which would occupy a
+ *    very small amount of space per object but create a large impact on the
+ *    performance of these commands. Instead, after it reaches a maximum size
+ *    (which should only happen on very small memory systems with a very large
+ *    number of filesystem objects), we stop taking new dbufs into the
+ *    metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ *    are not currently held but have been recently released. These dbufs
+ *    are not eligible for arc eviction until they are aged out of the cache.
+ *    Dbufs that are aged out of the cache will be immediately destroyed and
+ *    become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
  */
-static multilist_t *dbuf_cache;
-static refcount_t dbuf_cache_size;
-unsigned long dbuf_cache_max_bytes = 0;
+typedef struct dbuf_cache {
+	multilist_t *cache;
+	refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
-/* Set the default size of the dbuf cache to log2 fraction of arc size. */
+/* Size limits for the caches */
+unsigned long dbuf_cache_max_bytes = 0;
+unsigned long dbuf_metadata_cache_max_bytes = 0;
+
+/* Set the default sizes of the caches to log2 fraction of arc size */
 int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
 
 /*
- * The dbuf cache uses a three-stage eviction policy:
+ * The LRU dbuf cache uses a three-stage eviction policy:
  *  - A low water marker designates when the dbuf eviction thread
  *    should stop evicting from the dbuf cache.
  *  - When we reach the maximum size (aka mid water mark), we
@@ -381,6 +425,39 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 	return (NULL);
 }
 
+/*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+	DB_DNODE_ENTER(db);
+	dmu_object_type_t type = DB_DNODE(db)->dn_type;
+	DB_DNODE_EXIT(db);
+
+	/* Check if this dbuf is one of the types we care about */
+	if (DMU_OT_IS_METADATA_CACHED(type)) {
+		/* If we hit this, then we set something up wrong in dmu_ot */
+		ASSERT(DMU_OT_IS_METADATA(type));
+
+		/*
+		 * Sanity check for small-memory systems: don't allocate too
+		 * much memory for this purpose.
+		 */
+		if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+		    dbuf_metadata_cache_max_bytes) {
+			DBUF_STAT_BUMP(metadata_cache_overflow);
+			return (B_FALSE);
+		}
+
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
 /*
  * Remove an entry from the hash table. It must be in the EVICTING state.
  */
@ -574,13 +651,15 @@ dbuf_cache_lowater_bytes(void)
static inline boolean_t static inline boolean_t
dbuf_cache_above_hiwater(void) dbuf_cache_above_hiwater(void)
{ {
return (refcount_count(&dbuf_cache_size) > dbuf_cache_hiwater_bytes()); return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
dbuf_cache_hiwater_bytes());
} }
static inline boolean_t static inline boolean_t
dbuf_cache_above_lowater(void) dbuf_cache_above_lowater(void)
{ {
return (refcount_count(&dbuf_cache_size) > dbuf_cache_lowater_bytes()); return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
dbuf_cache_lowater_bytes());
} }
/* /*
@ -589,8 +668,9 @@ dbuf_cache_above_lowater(void)
static void static void
dbuf_evict_one(void) dbuf_evict_one(void)
{ {
int idx = multilist_get_random_index(dbuf_cache); int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx); multilist_sublist_t *mls = multilist_sublist_lock(
dbuf_caches[DB_DBUF_CACHE].cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
@ -605,15 +685,17 @@ dbuf_evict_one(void)
if (db != NULL) { if (db != NULL) {
multilist_sublist_remove(mls, db); multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls); multilist_sublist_unlock(mls);
(void) refcount_remove_many(&dbuf_cache_size, (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
db->db.db_size, db); db->db.db_size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size); db->db.db_size);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db); dbuf_destroy(db);
DBUF_STAT_MAX(cache_size_bytes_max, DBUF_STAT_MAX(cache_size_bytes_max,
refcount_count(&dbuf_cache_size)); refcount_count(&dbuf_caches[DB_DBUF_CACHE].size));
DBUF_STAT_BUMP(cache_total_evicts); DBUF_STAT_BUMP(cache_total_evicts);
} else { } else {
multilist_sublist_unlock(mls); multilist_sublist_unlock(mls);
@ -676,7 +758,8 @@ dbuf_evict_notify(void)
* because it's OK to occasionally make the wrong decision here, * because it's OK to occasionally make the wrong decision here,
* and grabbing the lock results in massive lock contention. * and grabbing the lock results in massive lock contention.
*/ */
if (refcount_count(&dbuf_cache_size) > dbuf_cache_target_bytes()) { if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
dbuf_cache_target_bytes()) {
if (dbuf_cache_above_hiwater()) if (dbuf_cache_above_hiwater())
dbuf_evict_one(); dbuf_evict_one();
cv_signal(&dbuf_evict_cv); cv_signal(&dbuf_evict_cv);
@ -691,8 +774,10 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
if (rw == KSTAT_WRITE) { if (rw == KSTAT_WRITE) {
return (SET_ERROR(EACCES)); return (SET_ERROR(EACCES));
} else { } else {
ds->metadata_cache_size_bytes.value.ui64 =
refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
ds->cache_size_bytes.value.ui64 = ds->cache_size_bytes.value.ui64 =
refcount_count(&dbuf_cache_size); refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
@ -746,15 +831,21 @@ retry:
dbuf_stats_init(h); dbuf_stats_init(h);
/* /*
* Setup the parameters for the dbuf cache. We set the size of the * Setup the parameters for the dbuf caches. We set the sizes of the
* dbuf cache to 1/32nd (default) of the target size of the ARC. If * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
* the value has been specified as a module option and it's not * of the target size of the ARC. If the values has been specified as
* greater than the target size of the ARC, then we honor that value. * a module option and they're not greater than the target size of the
* ARC, then we honor that value.
*/ */
if (dbuf_cache_max_bytes == 0 || if (dbuf_cache_max_bytes == 0 ||
dbuf_cache_max_bytes >= arc_target_bytes()) { dbuf_cache_max_bytes >= arc_target_bytes()) {
dbuf_cache_max_bytes = arc_target_bytes() >> dbuf_cache_shift; dbuf_cache_max_bytes = arc_target_bytes() >> dbuf_cache_shift;
} }
if (dbuf_metadata_cache_max_bytes == 0 ||
dbuf_metadata_cache_max_bytes >= arc_target_bytes()) {
dbuf_metadata_cache_max_bytes =
arc_target_bytes() >> dbuf_metadata_cache_shift;
}
/* /*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
@ -762,10 +853,13 @@ retry:
*/ */
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t), for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
offsetof(dmu_buf_impl_t, db_cache_link), dbuf_caches[dcs].cache =
dbuf_cache_multilist_index_func); multilist_create(sizeof (dmu_buf_impl_t),
refcount_create(&dbuf_cache_size); offsetof(dmu_buf_impl_t, db_cache_link),
dbuf_cache_multilist_index_func);
refcount_create(&dbuf_caches[dcs].size);
}
dbuf_evict_thread_exit = B_FALSE; dbuf_evict_thread_exit = B_FALSE;
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
@ -827,8 +921,10 @@ dbuf_fini(void)
mutex_destroy(&dbuf_evict_lock); mutex_destroy(&dbuf_evict_lock);
cv_destroy(&dbuf_evict_cv); cv_destroy(&dbuf_evict_cv);
refcount_destroy(&dbuf_cache_size); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
multilist_destroy(dbuf_cache); refcount_destroy(&dbuf_caches[dcs].size);
multilist_destroy(dbuf_caches[dcs].cache);
}
if (dbuf_ksp != NULL) { if (dbuf_ksp != NULL) {
kstat_delete(dbuf_ksp); kstat_delete(dbuf_ksp);
@ -1116,7 +1212,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
} }
cv_broadcast(&db->db_changed); cv_broadcast(&db->db_changed);
dbuf_rele_and_unlock(db, NULL, B_FALSE); dbuf_rele_and_unlock(db, NULL);
} }
@ -2430,13 +2526,23 @@ dbuf_destroy(dmu_buf_impl_t *db)
dbuf_clear_data(db); dbuf_clear_data(db);
if (multilist_link_active(&db->db_cache_link)) { if (multilist_link_active(&db->db_cache_link)) {
multilist_remove(dbuf_cache, db); ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
(void) refcount_remove_many(&dbuf_cache_size, db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
(void) refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db); db->db.db_size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], DBUF_STAT_BUMPDOWN(metadata_cache_count);
db->db.db_size); } else {
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
}
db->db_caching_status = DB_NO_CACHE;
} }
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@ -2474,7 +2580,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
* release any lock. * release any lock.
*/ */
mutex_enter(&dn->dn_mtx); mutex_enter(&dn->dn_mtx);
dnode_rele_and_unlock(dn, db, B_TRUE); dnode_rele_and_unlock(dn, db);
db->db_dnode_handle = NULL; db->db_dnode_handle = NULL;
dbuf_hash_remove(db); dbuf_hash_remove(db);
@ -2491,6 +2597,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(db->db_hash_next == NULL); ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL); ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL); ASSERT(db->db_data_pending == NULL);
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT(!multilist_link_active(&db->db_cache_link)); ASSERT(!multilist_link_active(&db->db_cache_link));
kmem_cache_free(dbuf_kmem_cache, db); kmem_cache_free(dbuf_kmem_cache, db);
@ -2502,7 +2609,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
*/ */
if (parent && parent != dndb) { if (parent && parent != dndb) {
mutex_enter(&parent->db_mtx); mutex_enter(&parent->db_mtx);
dbuf_rele_and_unlock(parent, db, B_TRUE); dbuf_rele_and_unlock(parent, db);
} }
} }
@ -2640,6 +2747,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID; db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
db->db_caching_status = DB_NO_CACHE;
/* the bonus dbuf is not placed in the hash table */ /* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
return (db); return (db);
@ -2673,6 +2781,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
avl_add(&dn->dn_dbufs, db); avl_add(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED; db->db_state = DB_UNCACHED;
db->db_caching_status = DB_NO_CACHE;
mutex_exit(&dn->dn_dbufs_mtx); mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
@ -3059,13 +3168,25 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
if (multilist_link_active(&dh->dh_db->db_cache_link)) { if (multilist_link_active(&dh->dh_db->db_cache_link)) {
ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); ASSERT(refcount_is_zero(&dh->dh_db->db_holds));
multilist_remove(dbuf_cache, dh->dh_db); ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE ||
(void) refcount_remove_many(&dbuf_cache_size, dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(
dbuf_caches[dh->dh_db->db_caching_status].cache,
dh->dh_db);
(void) refcount_remove_many(
&dbuf_caches[dh->dh_db->db_caching_status].size,
dh->dh_db->db.db_size, dh->dh_db); dh->dh_db->db.db_size, dh->dh_db);
DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count); if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], DBUF_STAT_BUMPDOWN(metadata_cache_count);
dh->dh_db->db.db_size); } else {
DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
dh->dh_db->db.db_size);
}
dh->dh_db->db_caching_status = DB_NO_CACHE;
} }
(void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
DBUF_VERIFY(dh->dh_db); DBUF_VERIFY(dh->dh_db);
@ -3230,7 +3351,7 @@ void
dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_rele(dmu_buf_impl_t *db, void *tag)
{ {
mutex_enter(&db->db_mtx); mutex_enter(&db->db_mtx);
dbuf_rele_and_unlock(db, tag, B_FALSE); dbuf_rele_and_unlock(db, tag);
} }
void void
@ -3253,7 +3374,7 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
* *
*/ */
void void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{ {
int64_t holds; int64_t holds;
@ -3343,19 +3464,40 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
db->db_pending_evict) { db->db_pending_evict) {
dbuf_destroy(db); dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) { } else if (!multilist_link_active(&db->db_cache_link)) {
multilist_insert(dbuf_cache, db); ASSERT3U(db->db_caching_status, ==,
(void) refcount_add_many(&dbuf_cache_size, DB_NO_CACHE);
dbuf_cached_state_t dcs =
dbuf_include_in_metadata_cache(db) ?
DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
db->db_caching_status = dcs;
multilist_insert(dbuf_caches[dcs].cache, db);
(void) refcount_add_many(&dbuf_caches[dcs].size,
db->db.db_size, db); db->db.db_size, db);
DBUF_STAT_BUMP(cache_levels[db->db_level]);
DBUF_STAT_BUMP(cache_count); if (dcs == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_INCR(cache_levels_bytes[db->db_level], DBUF_STAT_BUMP(metadata_cache_count);
db->db.db_size); DBUF_STAT_MAX(
DBUF_STAT_MAX(cache_size_bytes_max, metadata_cache_size_bytes_max,
refcount_count(&dbuf_cache_size)); refcount_count(
&dbuf_caches[dcs].size));
} else {
DBUF_STAT_BUMP(
cache_levels[db->db_level]);
DBUF_STAT_BUMP(cache_count);
DBUF_STAT_INCR(
cache_levels_bytes[db->db_level],
db->db.db_size);
DBUF_STAT_MAX(cache_size_bytes_max,
refcount_count(
&dbuf_caches[dcs].size));
}
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
if (!evicting) if (db->db_caching_status == DB_DBUF_CACHE) {
dbuf_evict_notify(); dbuf_evict_notify();
}
} }
if (do_arc_evict) if (do_arc_evict)
@ -3706,7 +3848,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
kmem_free(dr, sizeof (dbuf_dirty_record_t)); kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0); ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1; db->db_dirtycnt -= 1;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
return; return;
} }
@ -4081,7 +4223,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0); ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1; db->db_dirtycnt -= 1;
db->db_data_pending = NULL; db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
} }
static void static void
@ -4445,8 +4587,17 @@ MODULE_PARM_DESC(dbuf_cache_lowater_pct,
"Percentage below dbuf_cache_max_bytes when the evict thread stops " "Percentage below dbuf_cache_max_bytes when the evict thread stops "
"evicting dbufs."); "evicting dbufs.");
module_param(dbuf_metadata_cache_max_bytes, ulong, 0644);
MODULE_PARM_DESC(dbuf_metadata_cache_max_bytes,
"Maximum size in bytes of the dbuf metadata cache.");
module_param(dbuf_cache_shift, int, 0644); module_param(dbuf_cache_shift, int, 0644);
MODULE_PARM_DESC(dbuf_cache_shift, MODULE_PARM_DESC(dbuf_cache_shift,
"Set the size of the dbuf cache to a log2 fraction of arc size."); "Set the size of the dbuf cache to a log2 fraction of arc size.");
module_param(dbuf_metadata_cache_shift, int, 0644);
MODULE_PARM_DESC(dbuf_cache_shift,
"Set the size of the dbuf metadata cache to a log2 fraction of "
"arc size.");
/* END CSTYLED */ /* END CSTYLED */
#endif #endif

module/zfs/dmu.c

@ -81,60 +81,60 @@ int zfs_dmu_offset_next_sync = 0;
int zfs_object_remap_one_indirect_delay_ticks = 0; int zfs_object_remap_one_indirect_delay_ticks = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" },
{ DMU_BSWAP_UINT64, TRUE, TRUE, "ZIL intent log" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" },
{ DMU_BSWAP_DNODE, TRUE, TRUE, "DMU dnode" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" },
{ DMU_BSWAP_OBJSET, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" },
{ DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" },
{ DMU_BSWAP_OLDACL, TRUE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" },
{ DMU_BSWAP_UINT64, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" },
{ DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" },
{ DMU_BSWAP_ACL, TRUE, TRUE, "ZFS ACL" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "FUID table" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"},
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" },
{ DMU_BSWAP_UINT8, TRUE, TRUE, "System attributes" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" },
{ DMU_BSWAP_UINT8, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" },
{ DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" },
{ DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" }
}; };
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {

module/zfs/dmu_objset.c

@@ -471,6 +471,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
 	}
+	/*
+	 * These properties will be filled in by the logic in zfs_get_zplprop()
+	 * when they are queried for the first time.
+	 */
+	os->os_version = OBJSET_PROP_UNINITIALIZED;
+	os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+	os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+	os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
 
 	/*
 	 * Note: the changed_cb will be called once before the register

module/zfs/dnode.c

@@ -1574,11 +1574,11 @@ void
 dnode_rele(dnode_t *dn, void *tag)
 {
 	mutex_enter(&dn->dn_mtx);
-	dnode_rele_and_unlock(dn, tag, B_FALSE);
+	dnode_rele_and_unlock(dn, tag);
 }
 
 void
-dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
 {
 	uint64_t refs;
 	/* Get while the hold prevents the dnode from moving. */
@@ -1610,7 +1610,7 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
 		 * asserted anyway when the handle gets destroyed.
 		 */
 		mutex_enter(&db->db_mtx);
-		dbuf_rele_and_unlock(db, dnh, evicting);
+		dbuf_rele_and_unlock(db, dnh);
 	}
 }

module/zfs/dnode_sync.c

@@ -438,7 +438,7 @@ dnode_evict_dbufs(dnode_t *dn)
 			 * flow would look like:
 			 *
 			 * dbuf_destroy():
-			 *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+			 *   dnode_rele_and_unlock(parent_dbuf):
 			 *     if (!cacheable || pending_evict)
 			 *       dbuf_destroy()
 			 */
@@ -502,7 +502,7 @@ dnode_undirty_dbufs(list_t *list)
 			list_destroy(&dr->dt.di.dr_children);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
-		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 	}
 }

module/zfs/zcp_iter.c

@@ -33,6 +33,8 @@
 #include <sys/zcp.h>
 
+#include "zfs_comutil.h"
+
 typedef int (zcp_list_func_t)(lua_State *);
 typedef struct zcp_list_info {
 	const char *name;
@@ -232,20 +234,6 @@ zcp_snapshots_list(lua_State *state)
 	return (1);
 }
 
-/*
- * Note: channel programs only run in the global zone, so all datasets
- * are visible to this zone.
- */
-static boolean_t
-dataset_name_hidden(const char *name)
-{
-	if (strchr(name, '$') != NULL)
-		return (B_TRUE);
-	if (strchr(name, '%') != NULL)
-		return (B_TRUE);
-	return (B_FALSE);
-}
-
 static int
 zcp_children_iter(lua_State *state)
 {
@@ -275,7 +263,7 @@ zcp_children_iter(lua_State *state)
 	do {
 		err = dmu_dir_list_next(os,
 		    sizeof (childname) - (p - childname), p, NULL, &cursor);
-	} while (err == 0 && dataset_name_hidden(childname));
+	} while (err == 0 && zfs_dataset_name_hidden(childname));
 	dsl_dataset_rele(ds, FTAG);
 
 	if (err == ENOENT) {

module/zfs/zfs_ioctl.c

@@ -2252,23 +2252,6 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
 	return (err);
 }
 
-boolean_t
-dataset_name_hidden(const char *name)
-{
-	/*
-	 * Skip over datasets that are not visible in this zone,
-	 * internal datasets (which have a $ in their name), and
-	 * temporary datasets (which have a % in their name).
-	 */
-	if (strchr(name, '$') != NULL)
-		return (B_TRUE);
-	if (strchr(name, '%') != NULL)
-		return (B_TRUE);
-	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
-		return (B_TRUE);
-	return (B_FALSE);
-}
-
 /*
  * inputs:
  * zc_name		name of filesystem
@@ -2308,7 +2291,7 @@ top:
 		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
 			error = SET_ERROR(ESRCH);
-	} while (error == 0 && dataset_name_hidden(zc->zc_name));
+	} while (error == 0 && zfs_dataset_name_hidden(zc->zc_name));
 	dmu_objset_rele(os, FTAG);
 
 	/*

module/zfs/zfs_vfsops.c

@@ -2234,6 +2234,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
+	os->os_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
@@ -2246,13 +2247,42 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
-	const char *pname;
-	int error = SET_ERROR(ENOENT);
+	uint64_t *cached_copy = NULL;
 
 	/*
-	 * Look up the file system's value for the property. For the
-	 * version property, we look up a slightly different string.
+	 * Figure out where in the objset_t the cached copy would live, if it
+	 * is available for the requested property.
 	 */
+	if (os != NULL) {
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			cached_copy = &os->os_version;
+			break;
+		case ZFS_PROP_NORMALIZE:
+			cached_copy = &os->os_normalization;
+			break;
+		case ZFS_PROP_UTF8ONLY:
+			cached_copy = &os->os_utf8only;
+			break;
+		case ZFS_PROP_CASE:
+			cached_copy = &os->os_casesensitivity;
+			break;
+		default:
+			break;
+		}
+	}
+	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+		*value = *cached_copy;
+		return (0);
+	}
+
+	/*
+	 * If the property wasn't cached, look up the file system's value for
+	 * the property. For the version property, we look up a slightly
+	 * different string.
+	 */
+	const char *pname;
+	int error = ENOENT;
+
 	if (prop == ZFS_PROP_VERSION)
 		pname = ZPL_VERSION_STR;
 	else
@@ -2284,6 +2314,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 		}
 		error = 0;
 	}
+
+	/*
+	 * If one of the methods for getting the property value above worked,
+	 * copy it into the objset_t's cache.
+	 */
+	if (error == 0 && cached_copy != NULL) {
+		*cached_copy = *value;
+	}
+
 	return (error);
 }

tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh

@@ -58,7 +58,7 @@ function testdbufstat # stat_name dbufstat_filter
 	from_dbufstat=$(grep -w "$name" "$DBUFSTATS_FILE" | awk '{ print $3 }')
 	from_dbufs=$(dbufstat.py -bxn -i "$DBUFS_FILE" "$filter" | wc -l)
 
-	within_tolerance $from_dbufstat $from_dbufs 5 \
+	within_tolerance $from_dbufstat $from_dbufs 9 \
 		|| log_fail "Stat $name exceeded tolerance"
 }