mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-26 19:19:32 +03:00
a10e552b99
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads. O_DIRECT support in ZFS will always ensure there is coherency between buffered and O_DIRECT IO requests. This ensures that all IO requests, whether buffered or direct, will see the same file contents at all times. Just as in other FS's, O_DIRECT does not imply O_SYNC. While data is written directly to VDEV disks, metadata will not be synced until the associated TXG is synced. For both O_DIRECT read and write requests the offset and request sizes, at a minimum, must be PAGE_SIZE aligned. In the event they are not, then EINVAL is returned unless the direct property is set to always (see below). For O_DIRECT writes: The request also must be block aligned (recordsize) or the write request will take the normal (buffered) write path. In the event that the request is block aligned and a cached copy of the buffer is in the ARC, then it will be discarded from the ARC forcing all further reads to retrieve the data from disk. For O_DIRECT reads: The only alignment restrictions are PAGE_SIZE alignment. In the event that the requested data is buffered (in the ARC) it will just be copied from the ARC into the user buffer. For both O_DIRECT writes and reads the O_DIRECT flag will be ignored in the event that file contents are mmap'ed. In this case, all requests that are at least PAGE_SIZE aligned will just fall back to the buffered paths. If the request however is not PAGE_SIZE aligned, EINVAL will be returned as always regardless of whether the file's contents are mmap'ed. Since O_DIRECT writes go through the normal ZIO pipeline, the following operations are supported just as with normal buffered writes: Checksum, Compression, Encryption, Erasure Coding. There is one caveat for the data integrity of O_DIRECT writes that is distinct for each of the OS's supported by ZFS.
FreeBSD - FreeBSD is able to place user pages under write protection so any data in the user buffers and written directly down to the VDEV disks is guaranteed to not change. There is no concern with data integrity and O_DIRECT writes. Linux - Linux is not able to place anonymous user pages under write protection. Because of this, if the user decides to manipulate the page contents while the write operation is occurring, data integrity can not be guaranteed. However, there is a module parameter `zfs_vdev_direct_write_verify` that controls whether a checksum verify is run on O_DIRECT writes to a top-level VDEV before the contents of the I/O buffer are committed to disk. In the event of a checksum verification failure the write will return EIO. The number of O_DIRECT write checksum verification errors can be observed by doing `zpool status -d`, which will list all verification errors that have occurred on a top-level VDEV. Along with `zpool status`, a ZED event will be issued as `dio_verify` when a checksum verification error occurs. ZVOLs and dedup are not currently supported with Direct I/O. A new dataset property `direct` has been added with the following 3 allowable values: disabled - Accepts the O_DIRECT flag, but silently ignores it and treats the request as a buffered IO request. standard - Follows the alignment restrictions outlined above for write/read IO requests when the O_DIRECT flag is used. always - Treats every write/read IO request as though it passed O_DIRECT and will do O_DIRECT if the alignment restrictions are met, otherwise will redirect through the ARC. This property will not allow a request to fail. There is also a module parameter zfs_dio_enabled that can be used to force all reads and writes through the ARC. Setting this module parameter to 0 behaves as if the direct dataset property were set to disabled.
Reviewed-by: Brian Behlendorf <behlendorf@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Atkinson <batkinson@lanl.gov> Co-authored-by: Mark Maybee <mark.maybee@delphix.com> Co-authored-by: Matt Macy <mmacy@FreeBSD.org> Co-authored-by: Brian Behlendorf <behlendorf@llnl.gov> Closes #10018
5428 lines
157 KiB
C
5428 lines
157 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
/*
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
|
|
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
|
|
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
|
|
* Copyright (c) 2019, Klara Inc.
|
|
* Copyright (c) 2019, Allan Jude
|
|
* Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
|
|
*/
|
|
|
|
#include <sys/zfs_context.h>
|
|
#include <sys/arc.h>
|
|
#include <sys/dmu.h>
|
|
#include <sys/dmu_send.h>
|
|
#include <sys/dmu_impl.h>
|
|
#include <sys/dbuf.h>
|
|
#include <sys/dmu_objset.h>
|
|
#include <sys/dsl_dataset.h>
|
|
#include <sys/dsl_dir.h>
|
|
#include <sys/dmu_tx.h>
|
|
#include <sys/spa.h>
|
|
#include <sys/zio.h>
|
|
#include <sys/dmu_zfetch.h>
|
|
#include <sys/sa.h>
|
|
#include <sys/sa_impl.h>
|
|
#include <sys/zfeature.h>
|
|
#include <sys/blkptr.h>
|
|
#include <sys/range_tree.h>
|
|
#include <sys/trace_zfs.h>
|
|
#include <sys/callb.h>
|
|
#include <sys/abd.h>
|
|
#include <sys/brt.h>
|
|
#include <sys/vdev.h>
|
|
#include <cityhash.h>
|
|
#include <sys/spa_impl.h>
|
|
#include <sys/wmsum.h>
|
|
#include <sys/vdev_impl.h>
|
|
|
|
/* kstat handle for the dbuf statistics; see dbuf_kstat_update(). */
static kstat_t *dbuf_ksp;
|
|
|
|
typedef struct dbuf_stats {
|
|
/*
|
|
* Various statistics about the size of the dbuf cache.
|
|
*/
|
|
kstat_named_t cache_count;
|
|
kstat_named_t cache_size_bytes;
|
|
kstat_named_t cache_size_bytes_max;
|
|
/*
|
|
* Statistics regarding the bounds on the dbuf cache size.
|
|
*/
|
|
kstat_named_t cache_target_bytes;
|
|
kstat_named_t cache_lowater_bytes;
|
|
kstat_named_t cache_hiwater_bytes;
|
|
/*
|
|
* Total number of dbuf cache evictions that have occurred.
|
|
*/
|
|
kstat_named_t cache_total_evicts;
|
|
/*
|
|
* The distribution of dbuf levels in the dbuf cache and
|
|
* the total size of all dbufs at each level.
|
|
*/
|
|
kstat_named_t cache_levels[DN_MAX_LEVELS];
|
|
kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
|
|
/*
|
|
* Statistics about the dbuf hash table.
|
|
*/
|
|
kstat_named_t hash_hits;
|
|
kstat_named_t hash_misses;
|
|
kstat_named_t hash_collisions;
|
|
kstat_named_t hash_elements;
|
|
kstat_named_t hash_elements_max;
|
|
/*
|
|
* Number of sublists containing more than one dbuf in the dbuf
|
|
* hash table. Keep track of the longest hash chain.
|
|
*/
|
|
kstat_named_t hash_chains;
|
|
kstat_named_t hash_chain_max;
|
|
/*
|
|
* Number of times a dbuf_create() discovers that a dbuf was
|
|
* already created and in the dbuf hash table.
|
|
*/
|
|
kstat_named_t hash_insert_race;
|
|
/*
|
|
* Number of entries in the hash table dbuf and mutex arrays.
|
|
*/
|
|
kstat_named_t hash_table_count;
|
|
kstat_named_t hash_mutex_count;
|
|
/*
|
|
* Statistics about the size of the metadata dbuf cache.
|
|
*/
|
|
kstat_named_t metadata_cache_count;
|
|
kstat_named_t metadata_cache_size_bytes;
|
|
kstat_named_t metadata_cache_size_bytes_max;
|
|
/*
|
|
* For diagnostic purposes, this is incremented whenever we can't add
|
|
* something to the metadata cache because it's full, and instead put
|
|
* the data in the regular dbuf cache.
|
|
*/
|
|
kstat_named_t metadata_cache_overflow;
|
|
} dbuf_stats_t;
|
|
|
|
dbuf_stats_t dbuf_stats = {
|
|
{ "cache_count", KSTAT_DATA_UINT64 },
|
|
{ "cache_size_bytes", KSTAT_DATA_UINT64 },
|
|
{ "cache_size_bytes_max", KSTAT_DATA_UINT64 },
|
|
{ "cache_target_bytes", KSTAT_DATA_UINT64 },
|
|
{ "cache_lowater_bytes", KSTAT_DATA_UINT64 },
|
|
{ "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
|
|
{ "cache_total_evicts", KSTAT_DATA_UINT64 },
|
|
{ { "cache_levels_N", KSTAT_DATA_UINT64 } },
|
|
{ { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
|
|
{ "hash_hits", KSTAT_DATA_UINT64 },
|
|
{ "hash_misses", KSTAT_DATA_UINT64 },
|
|
{ "hash_collisions", KSTAT_DATA_UINT64 },
|
|
{ "hash_elements", KSTAT_DATA_UINT64 },
|
|
{ "hash_elements_max", KSTAT_DATA_UINT64 },
|
|
{ "hash_chains", KSTAT_DATA_UINT64 },
|
|
{ "hash_chain_max", KSTAT_DATA_UINT64 },
|
|
{ "hash_insert_race", KSTAT_DATA_UINT64 },
|
|
{ "hash_table_count", KSTAT_DATA_UINT64 },
|
|
{ "hash_mutex_count", KSTAT_DATA_UINT64 },
|
|
{ "metadata_cache_count", KSTAT_DATA_UINT64 },
|
|
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
|
|
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
|
|
{ "metadata_cache_overflow", KSTAT_DATA_UINT64 }
|
|
};
|
|
|
|
struct {
|
|
wmsum_t cache_count;
|
|
wmsum_t cache_total_evicts;
|
|
wmsum_t cache_levels[DN_MAX_LEVELS];
|
|
wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
|
|
wmsum_t hash_hits;
|
|
wmsum_t hash_misses;
|
|
wmsum_t hash_collisions;
|
|
wmsum_t hash_chains;
|
|
wmsum_t hash_insert_race;
|
|
wmsum_t metadata_cache_count;
|
|
wmsum_t metadata_cache_overflow;
|
|
} dbuf_sums;
|
|
|
|
/*
 * Helpers for updating the dbuf statistics.
 *
 * DBUF_STAT_INCR/DECR/BUMP/BUMPDOWN adjust the named wmsum counter in
 * dbuf_sums by val, -val, +1 and -1 respectively.
 */
#define	DBUF_STAT_INCR(stat, val)	wmsum_add(&dbuf_sums.stat, val)
#define	DBUF_STAT_DECR(stat, val)	DBUF_STAT_INCR(stat, -(val))
#define	DBUF_STAT_BUMP(stat)		DBUF_STAT_INCR(stat, 1)
#define	DBUF_STAT_BUMPDOWN(stat)	DBUF_STAT_INCR(stat, -1)

/*
 * Raise the named dbuf_stats value to v if v is larger than what is
 * currently stored.  The compare-and-swap is retried until it succeeds or
 * the stored value is already at least v.  Note that v is evaluated more
 * than once.
 */
#define	DBUF_STAT_MAX(stat, v) {					\
	uint64_t _seen;							\
	while ((v) > (_seen = dbuf_stats.stat.value.ui64) &&		\
	    (_seen != atomic_cas_64(&dbuf_stats.stat.value.ui64,	\
	    _seen, (v))))						\
		continue;						\
}
|
|
|
|
/* Forward declarations of helpers defined later in this file. */
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data,
    dmu_tx_t *tx);
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
|
|
|
|
/*
|
|
* Global data structures and functions for the dbuf cache.
|
|
*/
|
|
static kmem_cache_t *dbuf_kmem_cache;
|
|
static taskq_t *dbu_evict_taskq;
|
|
|
|
static kthread_t *dbuf_cache_evict_thread;
|
|
static kmutex_t dbuf_evict_lock;
|
|
static kcondvar_t dbuf_evict_cv;
|
|
static boolean_t dbuf_evict_thread_exit;
|
|
|
|
/*
|
|
* There are two dbuf caches; each dbuf can only be in one of them at a time.
|
|
*
|
|
* 1. Cache of metadata dbufs, to help make read-heavy administrative commands
|
|
* from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
|
|
* that represent the metadata that describes filesystems/snapshots/
|
|
* bookmarks/properties/etc. We only evict from this cache when we export a
|
|
* pool, to short-circuit as much I/O as possible for all administrative
|
|
* commands that need the metadata. There is no eviction policy for this
|
|
* cache, because we try to only include types in it which would occupy a
|
|
* very small amount of space per object but create a large impact on the
|
|
* performance of these commands. Instead, after it reaches a maximum size
|
|
* (which should only happen on very small memory systems with a very large
|
|
* number of filesystem objects), we stop taking new dbufs into the
|
|
* metadata cache, instead putting them in the normal dbuf cache.
|
|
*
|
|
* 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
|
|
* are not currently held but have been recently released. These dbufs
|
|
* are not eligible for arc eviction until they are aged out of the cache.
|
|
* Dbufs that are aged out of the cache will be immediately destroyed and
|
|
* become eligible for arc eviction.
|
|
*
|
|
* Dbufs are added to these caches once the last hold is released. If a dbuf is
|
|
* later accessed and still exists in the dbuf cache, then it will be removed
|
|
* from the cache and later re-added to the head of the cache.
|
|
*
|
|
* If a given dbuf meets the requirements for the metadata cache, it will go
|
|
* there, otherwise it will be considered for the generic LRU dbuf cache. The
|
|
* caches and the refcounts tracking their sizes are stored in an array indexed
|
|
* by those caches' matching enum values (from dbuf_cached_state_t).
|
|
*/
|
|
typedef struct dbuf_cache {
|
|
multilist_t cache;
|
|
zfs_refcount_t size ____cacheline_aligned;
|
|
} dbuf_cache_t;
|
|
dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
|
|
|
|
/* Size limits for the caches */
|
|
static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
|
|
static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
|
|
|
|
/* Set the default sizes of the caches to log2 fraction of arc size */
|
|
static uint_t dbuf_cache_shift = 5;
|
|
static uint_t dbuf_metadata_cache_shift = 6;
|
|
|
|
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
|
|
static uint_t dbuf_mutex_cache_shift = 0;
|
|
|
|
/* Target-size helpers; defined below, used before their definitions. */
static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void);
|
|
|
|
/*
|
|
* The LRU dbuf cache uses a three-stage eviction policy:
|
|
* - A low water marker designates when the dbuf eviction thread
|
|
* should stop evicting from the dbuf cache.
|
|
* - When we reach the maximum size (aka mid water mark), we
|
|
* signal the eviction thread to run.
|
|
* - The high water mark indicates when the eviction thread
|
|
* is unable to keep up with the incoming load and eviction must
|
|
* happen in the context of the calling thread.
|
|
*
|
|
 * The dbuf cache:
 *                                                 (max size)
 *                                      low water   mid water   hi water
 * +----------------------------------------+----------+----------+
 * |                                        |          |          |
 * |                                        |          |          |
 * |                                        |          |          |
 * |                                        |          |          |
 * +----------------------------------------+----------+----------+
 *                                        stop        signal     evict
 *                                      evicting     eviction   directly
 *                                                    thread
|
|
*
|
|
* The high and low water marks indicate the operating range for the eviction
|
|
* thread. The low water mark is, by default, 90% of the total size of the
|
|
* cache and the high water mark is at 110% (both of these percentages can be
|
|
* changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
|
|
* respectively). The eviction thread will try to ensure that the cache remains
|
|
* within this range by waking up every second and checking if the cache is
|
|
* above the low water mark. The thread can also be woken up by callers adding
|
|
* elements into the cache if the cache is larger than the mid water (i.e max
|
|
* cache size). Once the eviction thread is woken up and eviction is required,
|
|
* it will continue evicting buffers until it's able to reduce the cache size
|
|
* to the low water mark. If the cache size continues to grow and hits the high
|
|
* water mark, then callers adding elements to the cache will begin to evict
|
|
* directly from the cache until the cache is no longer above the high water
|
|
* mark.
|
|
*/
|
|
|
|
/*
|
|
* The percentage above and below the maximum cache size.
|
|
*/
|
|
static uint_t dbuf_cache_hiwater_pct = 10;
|
|
static uint_t dbuf_cache_lowater_pct = 10;
|
|
|
|
static int
|
|
dbuf_cons(void *vdb, void *unused, int kmflag)
|
|
{
|
|
(void) unused, (void) kmflag;
|
|
dmu_buf_impl_t *db = vdb;
|
|
memset(db, 0, sizeof (dmu_buf_impl_t));
|
|
|
|
mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
|
|
rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL);
|
|
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
|
|
multilist_link_init(&db->db_cache_link);
|
|
zfs_refcount_create(&db->db_holds);
|
|
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
dbuf_dest(void *vdb, void *unused)
|
|
{
|
|
(void) unused;
|
|
dmu_buf_impl_t *db = vdb;
|
|
mutex_destroy(&db->db_mtx);
|
|
rw_destroy(&db->db_rwlock);
|
|
cv_destroy(&db->db_changed);
|
|
ASSERT(!multilist_link_active(&db->db_cache_link));
|
|
zfs_refcount_destroy(&db->db_holds);
|
|
}
|
|
|
|
/*
|
|
* dbuf hash table routines
|
|
*/
|
|
/* The global dbuf hash table used by the lookup/insert/remove routines. */
static dbuf_hash_table_t dbuf_hash_table;
|
|
|
|
/*
|
|
* We use Cityhash for this. It's fast, and has good hash properties without
|
|
* requiring any large static buffers.
|
|
*/
|
|
static uint64_t
|
|
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
|
|
{
|
|
return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
|
|
}
|
|
|
|
/* Fire the dbuf state-change probe for db with a reason string. */
#define	DTRACE_SET_STATE(db, why)					\
	DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db,	\
	    const char *, why)

/*
 * True when dbuf has exactly the given identity: same object number,
 * objset, level and block id.
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
|
|
|
|
dmu_buf_impl_t *
|
|
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
|
|
uint64_t *hash_out)
|
|
{
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
uint64_t hv;
|
|
uint64_t idx;
|
|
dmu_buf_impl_t *db;
|
|
|
|
hv = dbuf_hash(os, obj, level, blkid);
|
|
idx = hv & h->hash_table_mask;
|
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
|
|
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
|
|
mutex_enter(&db->db_mtx);
|
|
if (db->db_state != DB_EVICTING) {
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
return (db);
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
}
|
|
}
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
if (hash_out != NULL)
|
|
*hash_out = hv;
|
|
return (NULL);
|
|
}
|
|
|
|
static dmu_buf_impl_t *
|
|
dbuf_find_bonus(objset_t *os, uint64_t object)
|
|
{
|
|
dnode_t *dn;
|
|
dmu_buf_impl_t *db = NULL;
|
|
|
|
if (dnode_hold(os, object, FTAG, &dn) == 0) {
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
if (dn->dn_bonus != NULL) {
|
|
db = dn->dn_bonus;
|
|
mutex_enter(&db->db_mtx);
|
|
}
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
dnode_rele(dn, FTAG);
|
|
}
|
|
return (db);
|
|
}
|
|
|
|
/*
|
|
* Insert an entry into the hash table. If there is already an element
|
|
* equal to elem in the hash table, then the already existing element
|
|
* will be returned and the new element will not be inserted.
|
|
* Otherwise returns NULL.
|
|
*/
|
|
static dmu_buf_impl_t *
|
|
dbuf_hash_insert(dmu_buf_impl_t *db)
|
|
{
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
objset_t *os = db->db_objset;
|
|
uint64_t obj = db->db.db_object;
|
|
int level = db->db_level;
|
|
uint64_t blkid, idx;
|
|
dmu_buf_impl_t *dbf;
|
|
uint32_t i;
|
|
|
|
blkid = db->db_blkid;
|
|
ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
|
|
idx = db->db_hash & h->hash_table_mask;
|
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
|
|
dbf = dbf->db_hash_next, i++) {
|
|
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
|
|
mutex_enter(&dbf->db_mtx);
|
|
if (dbf->db_state != DB_EVICTING) {
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
return (dbf);
|
|
}
|
|
mutex_exit(&dbf->db_mtx);
|
|
}
|
|
}
|
|
|
|
if (i > 0) {
|
|
DBUF_STAT_BUMP(hash_collisions);
|
|
if (i == 1)
|
|
DBUF_STAT_BUMP(hash_chains);
|
|
|
|
DBUF_STAT_MAX(hash_chain_max, i);
|
|
}
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
db->db_hash_next = h->hash_table[idx];
|
|
h->hash_table[idx] = db;
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
|
|
DBUF_STAT_MAX(hash_elements_max, he);
|
|
|
|
return (NULL);
|
|
}
|
|
|
|
/*
|
|
* This returns whether this dbuf should be stored in the metadata cache, which
|
|
* is based on whether it's from one of the dnode types that store data related
|
|
* to traversing dataset hierarchies.
|
|
*/
|
|
static boolean_t
|
|
dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
|
|
{
|
|
DB_DNODE_ENTER(db);
|
|
dmu_object_type_t type = DB_DNODE(db)->dn_type;
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/* Check if this dbuf is one of the types we care about */
|
|
if (DMU_OT_IS_METADATA_CACHED(type)) {
|
|
/* If we hit this, then we set something up wrong in dmu_ot */
|
|
ASSERT(DMU_OT_IS_METADATA(type));
|
|
|
|
/*
|
|
* Sanity check for small-memory systems: don't allocate too
|
|
* much memory for this purpose.
|
|
*/
|
|
if (zfs_refcount_count(
|
|
&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
|
|
dbuf_metadata_cache_target_bytes()) {
|
|
DBUF_STAT_BUMP(metadata_cache_overflow);
|
|
return (B_FALSE);
|
|
}
|
|
|
|
return (B_TRUE);
|
|
}
|
|
|
|
return (B_FALSE);
|
|
}
|
|
|
|
/*
|
|
* Remove an entry from the hash table. It must be in the EVICTING state.
|
|
*/
|
|
static void
|
|
dbuf_hash_remove(dmu_buf_impl_t *db)
|
|
{
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
uint64_t idx;
|
|
dmu_buf_impl_t *dbf, **dbp;
|
|
|
|
ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
|
|
db->db_blkid), ==, db->db_hash);
|
|
idx = db->db_hash & h->hash_table_mask;
|
|
|
|
/*
|
|
* We mustn't hold db_mtx to maintain lock ordering:
|
|
* DBUF_HASH_MUTEX > db_mtx.
|
|
*/
|
|
ASSERT(zfs_refcount_is_zero(&db->db_holds));
|
|
ASSERT(db->db_state == DB_EVICTING);
|
|
ASSERT(!MUTEX_HELD(&db->db_mtx));
|
|
|
|
mutex_enter(DBUF_HASH_MUTEX(h, idx));
|
|
dbp = &h->hash_table[idx];
|
|
while ((dbf = *dbp) != db) {
|
|
dbp = &dbf->db_hash_next;
|
|
ASSERT(dbf != NULL);
|
|
}
|
|
*dbp = db->db_hash_next;
|
|
db->db_hash_next = NULL;
|
|
if (h->hash_table[idx] &&
|
|
h->hash_table[idx]->db_hash_next == NULL)
|
|
DBUF_STAT_BUMPDOWN(hash_chains);
|
|
mutex_exit(DBUF_HASH_MUTEX(h, idx));
|
|
atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
|
|
}
|
|
|
|
/* Tells dbuf_verify_user() whether the check is made during eviction. */
typedef enum {
	DBVU_EVICTING,		/* user data is being evicted */
	DBVU_NOT_EVICTING	/* any other context */
} dbvu_verify_type_t;
|
|
|
|
static void
|
|
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
|
|
{
|
|
#ifdef ZFS_DEBUG
|
|
int64_t holds;
|
|
|
|
if (db->db_user == NULL)
|
|
return;
|
|
|
|
/* Only data blocks support the attachment of user data. */
|
|
ASSERT(db->db_level == 0);
|
|
|
|
/* Clients must resolve a dbuf before attaching user data. */
|
|
ASSERT(db->db.db_data != NULL);
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
|
|
holds = zfs_refcount_count(&db->db_holds);
|
|
if (verify_type == DBVU_EVICTING) {
|
|
/*
|
|
* Immediate eviction occurs when holds == dirtycnt.
|
|
* For normal eviction buffers, holds is zero on
|
|
* eviction, except when dbuf_fix_old_data() calls
|
|
* dbuf_clear_data(). However, the hold count can grow
|
|
* during eviction even though db_mtx is held (see
|
|
* dmu_bonus_hold() for an example), so we can only
|
|
* test the generic invariant that holds >= dirtycnt.
|
|
*/
|
|
ASSERT3U(holds, >=, db->db_dirtycnt);
|
|
} else {
|
|
if (db->db_user_immediate_evict == TRUE)
|
|
ASSERT3U(holds, >=, db->db_dirtycnt);
|
|
else
|
|
ASSERT3U(holds, >, 0);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
static void
|
|
dbuf_evict_user(dmu_buf_impl_t *db)
|
|
{
|
|
dmu_buf_user_t *dbu = db->db_user;
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
if (dbu == NULL)
|
|
return;
|
|
|
|
dbuf_verify_user(db, DBVU_EVICTING);
|
|
db->db_user = NULL;
|
|
|
|
#ifdef ZFS_DEBUG
|
|
if (dbu->dbu_clear_on_evict_dbufp != NULL)
|
|
*dbu->dbu_clear_on_evict_dbufp = NULL;
|
|
#endif
|
|
|
|
if (db->db_caching_status != DB_NO_CACHE) {
|
|
/*
|
|
* This is a cached dbuf, so the size of the user data is
|
|
* included in its cached amount. We adjust it here because the
|
|
* user data has already been detached from the dbuf, and the
|
|
* sync functions are not supposed to touch it (the dbuf might
|
|
* not exist anymore by the time the sync functions run.
|
|
*/
|
|
uint64_t size = dbu->dbu_size;
|
|
(void) zfs_refcount_remove_many(
|
|
&dbuf_caches[db->db_caching_status].size, size, dbu);
|
|
if (db->db_caching_status == DB_DBUF_CACHE)
|
|
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
|
|
}
|
|
|
|
/*
|
|
* There are two eviction callbacks - one that we call synchronously
|
|
* and one that we invoke via a taskq. The async one is useful for
|
|
* avoiding lock order reversals and limiting stack depth.
|
|
*
|
|
* Note that if we have a sync callback but no async callback,
|
|
* it's likely that the sync callback will free the structure
|
|
* containing the dbu. In that case we need to take care to not
|
|
* dereference dbu after calling the sync evict func.
|
|
*/
|
|
boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
|
|
|
|
if (dbu->dbu_evict_func_sync != NULL)
|
|
dbu->dbu_evict_func_sync(dbu);
|
|
|
|
if (has_async) {
|
|
taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
|
|
dbu, 0, &dbu->dbu_tqent);
|
|
}
|
|
}
|
|
|
|
boolean_t
|
|
dbuf_is_metadata(dmu_buf_impl_t *db)
|
|
{
|
|
/*
|
|
* Consider indirect blocks and spill blocks to be meta data.
|
|
*/
|
|
if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
|
|
return (B_TRUE);
|
|
} else {
|
|
boolean_t is_metadata;
|
|
|
|
DB_DNODE_ENTER(db);
|
|
is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
|
|
DB_DNODE_EXIT(db);
|
|
|
|
return (is_metadata);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* We want to exclude buffers that are on a special allocation class from
|
|
* L2ARC.
|
|
*/
|
|
boolean_t
|
|
dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
|
|
{
|
|
if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
|
|
(db->db_objset->os_secondary_cache ==
|
|
ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
|
|
if (l2arc_exclude_special == 0)
|
|
return (B_TRUE);
|
|
|
|
/*
|
|
* bp must be checked in the event it was passed from
|
|
* dbuf_read_impl() as the result of a the BP being set from
|
|
* a Direct I/O write in dbuf_read(). See comments in
|
|
* dbuf_read().
|
|
*/
|
|
blkptr_t *db_bp = bp == NULL ? db->db_blkptr : bp;
|
|
|
|
if (db_bp == NULL || BP_IS_HOLE(db_bp))
|
|
return (B_FALSE);
|
|
uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
|
|
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
|
|
vdev_t *vd = NULL;
|
|
|
|
if (vdev < rvd->vdev_children)
|
|
vd = rvd->vdev_child[vdev];
|
|
|
|
if (vd == NULL)
|
|
return (B_TRUE);
|
|
|
|
if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
|
|
vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
|
|
return (B_TRUE);
|
|
}
|
|
return (B_FALSE);
|
|
}
|
|
|
|
static inline boolean_t
|
|
dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
|
|
{
|
|
if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
|
|
(dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
|
|
(level > 0 ||
|
|
DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
|
|
if (l2arc_exclude_special == 0)
|
|
return (B_TRUE);
|
|
|
|
if (bp == NULL || BP_IS_HOLE(bp))
|
|
return (B_FALSE);
|
|
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
|
|
vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
|
|
vdev_t *vd = NULL;
|
|
|
|
if (vdev < rvd->vdev_children)
|
|
vd = rvd->vdev_child[vdev];
|
|
|
|
if (vd == NULL)
|
|
return (B_TRUE);
|
|
|
|
if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
|
|
vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
|
|
return (B_TRUE);
|
|
}
|
|
return (B_FALSE);
|
|
}
|
|
|
|
|
|
/*
|
|
* This function *must* return indices evenly distributed between all
|
|
* sublists of the multilist. This is needed due to how the dbuf eviction
|
|
* code is laid out; dbuf_evict_thread() assumes dbufs are evenly
|
|
* distributed between all sublists and uses this assumption when
|
|
* deciding which sublist to evict from and how much to evict from it.
|
|
*/
|
|
static unsigned int
|
|
dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
|
|
{
|
|
dmu_buf_impl_t *db = obj;
|
|
|
|
/*
|
|
* The assumption here, is the hash value for a given
|
|
* dmu_buf_impl_t will remain constant throughout it's lifetime
|
|
* (i.e. it's objset, object, level and blkid fields don't change).
|
|
* Thus, we don't need to store the dbuf's sublist index
|
|
* on insertion, as this index can be recalculated on removal.
|
|
*
|
|
* Also, the low order bits of the hash value are thought to be
|
|
* distributed evenly. Otherwise, in the case that the multilist
|
|
* has a power of two number of sublists, each sublists' usage
|
|
* would not be evenly distributed. In this context full 64bit
|
|
* division would be a waste of time, so limit it to 32 bits.
|
|
*/
|
|
return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
|
|
db->db_level, db->db_blkid) %
|
|
multilist_get_num_sublists(ml));
|
|
}
|
|
|
|
/*
|
|
* The target size of the dbuf cache can grow with the ARC target,
|
|
* unless limited by the tunable dbuf_cache_max_bytes.
|
|
*/
|
|
static inline unsigned long
|
|
dbuf_cache_target_bytes(void)
|
|
{
|
|
return (MIN(dbuf_cache_max_bytes,
|
|
arc_target_bytes() >> dbuf_cache_shift));
|
|
}
|
|
|
|
/*
|
|
* The target size of the dbuf metadata cache can grow with the ARC target,
|
|
* unless limited by the tunable dbuf_metadata_cache_max_bytes.
|
|
*/
|
|
static inline unsigned long
|
|
dbuf_metadata_cache_target_bytes(void)
|
|
{
|
|
return (MIN(dbuf_metadata_cache_max_bytes,
|
|
arc_target_bytes() >> dbuf_metadata_cache_shift));
|
|
}
|
|
|
|
static inline uint64_t
|
|
dbuf_cache_hiwater_bytes(void)
|
|
{
|
|
uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
|
|
return (dbuf_cache_target +
|
|
(dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
|
|
}
|
|
|
|
static inline uint64_t
|
|
dbuf_cache_lowater_bytes(void)
|
|
{
|
|
uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
|
|
return (dbuf_cache_target -
|
|
(dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
|
|
}
|
|
|
|
static inline boolean_t
|
|
dbuf_cache_above_lowater(void)
|
|
{
|
|
return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
|
|
dbuf_cache_lowater_bytes());
|
|
}
|
|
|
|
/*
|
|
* Evict the oldest eligible dbuf from the dbuf cache.
|
|
*/
|
|
static void
|
|
dbuf_evict_one(void)
|
|
{
|
|
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
|
|
multilist_sublist_t *mls = multilist_sublist_lock_idx(
|
|
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
|
|
|
|
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
|
|
|
|
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
|
|
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
|
|
db = multilist_sublist_prev(mls, db);
|
|
}
|
|
|
|
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
|
|
multilist_sublist_t *, mls);
|
|
|
|
if (db != NULL) {
|
|
multilist_sublist_remove(mls, db);
|
|
multilist_sublist_unlock(mls);
|
|
uint64_t size = db->db.db_size;
|
|
uint64_t usize = dmu_buf_user_size(&db->db);
|
|
(void) zfs_refcount_remove_many(
|
|
&dbuf_caches[DB_DBUF_CACHE].size, size, db);
|
|
(void) zfs_refcount_remove_many(
|
|
&dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user);
|
|
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
|
|
DBUF_STAT_BUMPDOWN(cache_count);
|
|
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize);
|
|
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
|
|
db->db_caching_status = DB_NO_CACHE;
|
|
dbuf_destroy(db);
|
|
DBUF_STAT_BUMP(cache_total_evicts);
|
|
} else {
|
|
multilist_sublist_unlock(mls);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* The dbuf evict thread is responsible for aging out dbufs from the
|
|
* cache. Once the cache has reached it's maximum size, dbufs are removed
|
|
* and destroyed. The eviction thread will continue running until the size
|
|
* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
|
|
* out of the cache it is destroyed and becomes eligible for arc eviction.
|
|
*/
|
|
static __attribute__((noreturn)) void
|
|
dbuf_evict_thread(void *unused)
|
|
{
|
|
(void) unused;
|
|
callb_cpr_t cpr;
|
|
|
|
CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
|
|
|
|
mutex_enter(&dbuf_evict_lock);
|
|
while (!dbuf_evict_thread_exit) {
|
|
while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
|
|
CALLB_CPR_SAFE_BEGIN(&cpr);
|
|
(void) cv_timedwait_idle_hires(&dbuf_evict_cv,
|
|
&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
|
|
CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
|
|
}
|
|
mutex_exit(&dbuf_evict_lock);
|
|
|
|
/*
|
|
* Keep evicting as long as we're above the low water mark
|
|
* for the cache. We do this without holding the locks to
|
|
* minimize lock contention.
|
|
*/
|
|
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
|
|
dbuf_evict_one();
|
|
}
|
|
|
|
mutex_enter(&dbuf_evict_lock);
|
|
}
|
|
|
|
dbuf_evict_thread_exit = B_FALSE;
|
|
cv_broadcast(&dbuf_evict_cv);
|
|
CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
|
|
thread_exit();
|
|
}
|
|
|
|
/*
|
|
* Wake up the dbuf eviction thread if the dbuf cache is at its max size.
|
|
* If the dbuf cache is at its high water mark, then evict a dbuf from the
|
|
* dbuf cache using the caller's context.
|
|
*/
|
|
static void
|
|
dbuf_evict_notify(uint64_t size)
|
|
{
|
|
/*
|
|
* We check if we should evict without holding the dbuf_evict_lock,
|
|
* because it's OK to occasionally make the wrong decision here,
|
|
* and grabbing the lock results in massive lock contention.
|
|
*/
|
|
if (size > dbuf_cache_target_bytes()) {
|
|
if (size > dbuf_cache_hiwater_bytes())
|
|
dbuf_evict_one();
|
|
cv_signal(&dbuf_evict_cv);
|
|
}
|
|
}
|
|
|
|
static int
|
|
dbuf_kstat_update(kstat_t *ksp, int rw)
|
|
{
|
|
dbuf_stats_t *ds = ksp->ks_data;
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
|
|
if (rw == KSTAT_WRITE)
|
|
return (SET_ERROR(EACCES));
|
|
|
|
ds->cache_count.value.ui64 =
|
|
wmsum_value(&dbuf_sums.cache_count);
|
|
ds->cache_size_bytes.value.ui64 =
|
|
zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
|
|
ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
|
|
ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
|
|
ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
|
|
ds->cache_total_evicts.value.ui64 =
|
|
wmsum_value(&dbuf_sums.cache_total_evicts);
|
|
for (int i = 0; i < DN_MAX_LEVELS; i++) {
|
|
ds->cache_levels[i].value.ui64 =
|
|
wmsum_value(&dbuf_sums.cache_levels[i]);
|
|
ds->cache_levels_bytes[i].value.ui64 =
|
|
wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
|
|
}
|
|
ds->hash_hits.value.ui64 =
|
|
wmsum_value(&dbuf_sums.hash_hits);
|
|
ds->hash_misses.value.ui64 =
|
|
wmsum_value(&dbuf_sums.hash_misses);
|
|
ds->hash_collisions.value.ui64 =
|
|
wmsum_value(&dbuf_sums.hash_collisions);
|
|
ds->hash_chains.value.ui64 =
|
|
wmsum_value(&dbuf_sums.hash_chains);
|
|
ds->hash_insert_race.value.ui64 =
|
|
wmsum_value(&dbuf_sums.hash_insert_race);
|
|
ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
|
|
ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
|
|
ds->metadata_cache_count.value.ui64 =
|
|
wmsum_value(&dbuf_sums.metadata_cache_count);
|
|
ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
|
|
&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
|
|
ds->metadata_cache_overflow.value.ui64 =
|
|
wmsum_value(&dbuf_sums.metadata_cache_overflow);
|
|
return (0);
|
|
}
|
|
|
|
void
|
|
dbuf_init(void)
|
|
{
|
|
uint64_t hmsize, hsize = 1ULL << 16;
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
|
|
/*
|
|
* The hash table is big enough to fill one eighth of physical memory
|
|
* with an average block size of zfs_arc_average_blocksize (default 8K).
|
|
* By default, the table will take up
|
|
* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
|
|
*/
|
|
while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
|
|
hsize <<= 1;
|
|
|
|
h->hash_table = NULL;
|
|
while (h->hash_table == NULL) {
|
|
h->hash_table_mask = hsize - 1;
|
|
|
|
h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
|
|
if (h->hash_table == NULL)
|
|
hsize >>= 1;
|
|
|
|
ASSERT3U(hsize, >=, 1ULL << 10);
|
|
}
|
|
|
|
/*
|
|
* The hash table buckets are protected by an array of mutexes where
|
|
* each mutex is reponsible for protecting 128 buckets. A minimum
|
|
* array size of 8192 is targeted to avoid contention.
|
|
*/
|
|
if (dbuf_mutex_cache_shift == 0)
|
|
hmsize = MAX(hsize >> 7, 1ULL << 13);
|
|
else
|
|
hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
|
|
|
|
h->hash_mutexes = NULL;
|
|
while (h->hash_mutexes == NULL) {
|
|
h->hash_mutex_mask = hmsize - 1;
|
|
|
|
h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
|
|
KM_SLEEP);
|
|
if (h->hash_mutexes == NULL)
|
|
hmsize >>= 1;
|
|
}
|
|
|
|
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
|
|
sizeof (dmu_buf_impl_t),
|
|
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
|
|
|
|
for (int i = 0; i < hmsize; i++)
|
|
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL);
|
|
|
|
dbuf_stats_init(h);
|
|
|
|
/*
|
|
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
|
|
* configuration is not required.
|
|
*/
|
|
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
|
|
|
|
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
|
|
multilist_create(&dbuf_caches[dcs].cache,
|
|
sizeof (dmu_buf_impl_t),
|
|
offsetof(dmu_buf_impl_t, db_cache_link),
|
|
dbuf_cache_multilist_index_func);
|
|
zfs_refcount_create(&dbuf_caches[dcs].size);
|
|
}
|
|
|
|
dbuf_evict_thread_exit = B_FALSE;
|
|
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
|
|
dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
|
|
NULL, 0, &p0, TS_RUN, minclsyspri);
|
|
|
|
wmsum_init(&dbuf_sums.cache_count, 0);
|
|
wmsum_init(&dbuf_sums.cache_total_evicts, 0);
|
|
for (int i = 0; i < DN_MAX_LEVELS; i++) {
|
|
wmsum_init(&dbuf_sums.cache_levels[i], 0);
|
|
wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
|
|
}
|
|
wmsum_init(&dbuf_sums.hash_hits, 0);
|
|
wmsum_init(&dbuf_sums.hash_misses, 0);
|
|
wmsum_init(&dbuf_sums.hash_collisions, 0);
|
|
wmsum_init(&dbuf_sums.hash_chains, 0);
|
|
wmsum_init(&dbuf_sums.hash_insert_race, 0);
|
|
wmsum_init(&dbuf_sums.metadata_cache_count, 0);
|
|
wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
|
|
|
|
dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
|
|
KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
|
|
KSTAT_FLAG_VIRTUAL);
|
|
if (dbuf_ksp != NULL) {
|
|
for (int i = 0; i < DN_MAX_LEVELS; i++) {
|
|
snprintf(dbuf_stats.cache_levels[i].name,
|
|
KSTAT_STRLEN, "cache_level_%d", i);
|
|
dbuf_stats.cache_levels[i].data_type =
|
|
KSTAT_DATA_UINT64;
|
|
snprintf(dbuf_stats.cache_levels_bytes[i].name,
|
|
KSTAT_STRLEN, "cache_level_%d_bytes", i);
|
|
dbuf_stats.cache_levels_bytes[i].data_type =
|
|
KSTAT_DATA_UINT64;
|
|
}
|
|
dbuf_ksp->ks_data = &dbuf_stats;
|
|
dbuf_ksp->ks_update = dbuf_kstat_update;
|
|
kstat_install(dbuf_ksp);
|
|
}
|
|
}
|
|
|
|
void
|
|
dbuf_fini(void)
|
|
{
|
|
dbuf_hash_table_t *h = &dbuf_hash_table;
|
|
|
|
dbuf_stats_destroy();
|
|
|
|
for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
|
|
mutex_destroy(&h->hash_mutexes[i]);
|
|
|
|
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
|
|
vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
|
|
sizeof (kmutex_t));
|
|
|
|
kmem_cache_destroy(dbuf_kmem_cache);
|
|
taskq_destroy(dbu_evict_taskq);
|
|
|
|
mutex_enter(&dbuf_evict_lock);
|
|
dbuf_evict_thread_exit = B_TRUE;
|
|
while (dbuf_evict_thread_exit) {
|
|
cv_signal(&dbuf_evict_cv);
|
|
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
|
|
}
|
|
mutex_exit(&dbuf_evict_lock);
|
|
|
|
mutex_destroy(&dbuf_evict_lock);
|
|
cv_destroy(&dbuf_evict_cv);
|
|
|
|
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
|
|
zfs_refcount_destroy(&dbuf_caches[dcs].size);
|
|
multilist_destroy(&dbuf_caches[dcs].cache);
|
|
}
|
|
|
|
if (dbuf_ksp != NULL) {
|
|
kstat_delete(dbuf_ksp);
|
|
dbuf_ksp = NULL;
|
|
}
|
|
|
|
wmsum_fini(&dbuf_sums.cache_count);
|
|
wmsum_fini(&dbuf_sums.cache_total_evicts);
|
|
for (int i = 0; i < DN_MAX_LEVELS; i++) {
|
|
wmsum_fini(&dbuf_sums.cache_levels[i]);
|
|
wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
|
|
}
|
|
wmsum_fini(&dbuf_sums.hash_hits);
|
|
wmsum_fini(&dbuf_sums.hash_misses);
|
|
wmsum_fini(&dbuf_sums.hash_collisions);
|
|
wmsum_fini(&dbuf_sums.hash_chains);
|
|
wmsum_fini(&dbuf_sums.hash_insert_race);
|
|
wmsum_fini(&dbuf_sums.metadata_cache_count);
|
|
wmsum_fini(&dbuf_sums.metadata_cache_overflow);
|
|
}
|
|
|
|
/*
|
|
* Other stuff.
|
|
*/
|
|
|
|
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf. Does nothing unless
 * ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags. The caller must hold
 * db->db_mtx.
 *
 * Checks, in order: the dbuf's identity against its dnode, the offset
 * implied by its block id, that its dirty records are ordered by strictly
 * decreasing txg, that a meta-dnode block has a single copy of its data,
 * that db_blkptr points where the parent says it should, and that a clean
 * buffer with no (or a hole) block pointer contains no data that would be
 * lost on eviction.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;
	/*
	 * Kept at full txg width: a narrower type would truncate dr_txg and
	 * could make the ordering assertion below misfire.
	 */
	uint64_t txg_prev;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Dirty records must be listed newest txg first. */
	if ((dr = list_head(&db->db_dirty_records)) != NULL) {
		ASSERT(dr->dr_dbuf == db);
		txg_prev = dr->dr_txg;
		for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
		    dr = list_next(&db->db_dirty_records, dr)) {
			ASSERT(dr->dr_dbuf == db);
			ASSERT(txg_prev > dr->dr_txg);
			txg_prev = dr->dr_txg;
		}
	}

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb __maybe_unused = db->db_parent->db.db_size >>
			    SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the parent's rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 *
		 * There is an exception to this rule for indirect blocks; in
		 * this case, if the indirect block is a hole, we fill in a few
		 * fields on each of the child blocks (importantly, birth time)
		 * to prevent hole birth times from being lost when you
		 * partially fill in a hole.
		 */
		if (db->db_dirtycnt == 0) {
			if (db->db_level == 0) {
				uint64_t *buf = db->db.db_data;
				int i;

				for (i = 0; i < db->db.db_size >> 3; i++) {
					ASSERT(buf[i] == 0);
				}
			} else {
				blkptr_t *bps = db->db.db_data;
				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
				    db->db.db_size);
				/*
				 * We want to verify that all the blkptrs in the
				 * indirect block are holes, but we may have
				 * automatically set up a few fields for them.
				 * We iterate through each blkptr and verify
				 * they only have those fields set.
				 */
				for (int i = 0;
				    i < db->db.db_size / sizeof (blkptr_t);
				    i++) {
					blkptr_t *bp = &bps[i];
					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT0(bp->blk_pad[0]);
					ASSERT0(bp->blk_pad[1]);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif
|
|
|
|
static void
|
|
dbuf_clear_data(dmu_buf_impl_t *db)
|
|
{
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
dbuf_evict_user(db);
|
|
ASSERT3P(db->db_buf, ==, NULL);
|
|
db->db.db_data = NULL;
|
|
if (db->db_state != DB_NOFILL) {
|
|
db->db_state = DB_UNCACHED;
|
|
DTRACE_SET_STATE(db, "clear data");
|
|
}
|
|
}
|
|
|
|
static void
|
|
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
|
|
{
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
ASSERT(buf != NULL);
|
|
|
|
db->db_buf = buf;
|
|
ASSERT(buf->b_data != NULL);
|
|
db->db.db_data = buf->b_data;
|
|
}
|
|
|
|
static arc_buf_t *
|
|
dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
|
|
{
|
|
spa_t *spa = db->db_objset->os_spa;
|
|
|
|
return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
|
|
}
|
|
|
|
/*
|
|
* Loan out an arc_buf for read. Return the loaned arc_buf.
|
|
*/
|
|
arc_buf_t *
|
|
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
|
|
{
|
|
arc_buf_t *abuf;
|
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
mutex_enter(&db->db_mtx);
|
|
if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
|
|
int blksz = db->db.db_size;
|
|
spa_t *spa = db->db_objset->os_spa;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
abuf = arc_loan_buf(spa, B_FALSE, blksz);
|
|
memcpy(abuf->b_data, db->db.db_data, blksz);
|
|
} else {
|
|
abuf = db->db_buf;
|
|
arc_loan_inuse_buf(abuf, db);
|
|
db->db_buf = NULL;
|
|
dbuf_clear_data(db);
|
|
mutex_exit(&db->db_mtx);
|
|
}
|
|
return (abuf);
|
|
}
|
|
|
|
/*
|
|
* Calculate which level n block references the data at the level 0 offset
|
|
* provided.
|
|
*/
|
|
uint64_t
|
|
dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
|
|
{
|
|
if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
|
|
/*
|
|
* The level n blkid is equal to the level 0 blkid divided by
|
|
* the number of level 0s in a level n block.
|
|
*
|
|
* The level 0 blkid is offset >> datablkshift =
|
|
* offset / 2^datablkshift.
|
|
*
|
|
* The number of level 0s in a level n is the number of block
|
|
* pointers in an indirect block, raised to the power of level.
|
|
* This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
|
|
* 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
|
|
*
|
|
* Thus, the level n blkid is: offset /
|
|
* ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
|
|
* = offset / 2^(datablkshift + level *
|
|
* (indblkshift - SPA_BLKPTRSHIFT))
|
|
* = offset >> (datablkshift + level *
|
|
* (indblkshift - SPA_BLKPTRSHIFT))
|
|
*/
|
|
|
|
const unsigned exp = dn->dn_datablkshift +
|
|
level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
|
|
|
|
if (exp >= 8 * sizeof (offset)) {
|
|
/* This only happens on the highest indirection level */
|
|
ASSERT3U(level, ==, dn->dn_nlevels - 1);
|
|
return (0);
|
|
}
|
|
|
|
ASSERT3U(exp, <, 8 * sizeof (offset));
|
|
|
|
return (offset >> exp);
|
|
} else {
|
|
ASSERT3U(offset, <, dn->dn_datablksz);
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* This function is used to lock the parent of the provided dbuf. This should be
|
|
* used when modifying or reading db_blkptr.
|
|
*/
|
|
db_lock_type_t
|
|
dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
|
|
{
|
|
enum db_lock_type ret = DLT_NONE;
|
|
if (db->db_parent != NULL) {
|
|
rw_enter(&db->db_parent->db_rwlock, rw);
|
|
ret = DLT_PARENT;
|
|
} else if (dmu_objset_ds(db->db_objset) != NULL) {
|
|
rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
|
|
tag);
|
|
ret = DLT_OBJSET;
|
|
}
|
|
/*
|
|
* We only return a DLT_NONE lock when it's the top-most indirect block
|
|
* of the meta-dnode of the MOS.
|
|
*/
|
|
return (ret);
|
|
}
|
|
|
|
/*
|
|
* We need to pass the lock type in because it's possible that the block will
|
|
* move from being the topmost indirect block in a dnode (and thus, have no
|
|
* parent) to not the top-most via an indirection increase. This would cause a
|
|
* panic if we didn't pass the lock type in.
|
|
*/
|
|
void
|
|
dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
|
|
{
|
|
if (type == DLT_PARENT)
|
|
rw_exit(&db->db_parent->db_rwlock);
|
|
else if (type == DLT_OBJSET)
|
|
rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
|
|
}
|
|
|
|
static void
|
|
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
|
arc_buf_t *buf, void *vdb)
|
|
{
|
|
(void) zb, (void) bp;
|
|
dmu_buf_impl_t *db = vdb;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
ASSERT3U(db->db_state, ==, DB_READ);
|
|
|
|
/*
|
|
* All reads are synchronous, so we must have a hold on the dbuf
|
|
*/
|
|
ASSERT(zfs_refcount_count(&db->db_holds) > 0);
|
|
ASSERT(db->db_buf == NULL);
|
|
ASSERT(db->db.db_data == NULL);
|
|
if (buf == NULL) {
|
|
/* i/o error */
|
|
ASSERT(zio == NULL || zio->io_error != 0);
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
ASSERT3P(db->db_buf, ==, NULL);
|
|
db->db_state = DB_UNCACHED;
|
|
DTRACE_SET_STATE(db, "i/o error");
|
|
} else if (db->db_level == 0 && db->db_freed_in_flight) {
|
|
/* freed in flight */
|
|
ASSERT(zio == NULL || zio->io_error == 0);
|
|
arc_release(buf, db);
|
|
memset(buf->b_data, 0, db->db.db_size);
|
|
arc_buf_freeze(buf);
|
|
db->db_freed_in_flight = FALSE;
|
|
dbuf_set_data(db, buf);
|
|
db->db_state = DB_CACHED;
|
|
DTRACE_SET_STATE(db, "freed in flight");
|
|
} else {
|
|
/* success */
|
|
ASSERT(zio == NULL || zio->io_error == 0);
|
|
dbuf_set_data(db, buf);
|
|
db->db_state = DB_CACHED;
|
|
DTRACE_SET_STATE(db, "successful read");
|
|
}
|
|
cv_broadcast(&db->db_changed);
|
|
dbuf_rele_and_unlock(db, NULL, B_FALSE);
|
|
}
|
|
|
|
/*
|
|
* Shortcut for performing reads on bonus dbufs. Returns
|
|
* an error if we fail to verify the dnode associated with
|
|
* a decrypted block. Otherwise success.
|
|
*/
|
|
static int
|
|
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
|
|
{
|
|
int bonuslen, max_bonuslen;
|
|
|
|
bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
|
|
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
ASSERT(DB_DNODE_HELD(db));
|
|
ASSERT3U(bonuslen, <=, db->db.db_size);
|
|
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
|
|
arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
|
|
if (bonuslen < max_bonuslen)
|
|
memset(db->db.db_data, 0, max_bonuslen);
|
|
if (bonuslen)
|
|
memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen);
|
|
db->db_state = DB_CACHED;
|
|
DTRACE_SET_STATE(db, "bonus buffer filled");
|
|
return (0);
|
|
}
|
|
|
|
static void
|
|
dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
|
|
{
|
|
blkptr_t *bps = db->db.db_data;
|
|
uint32_t indbs = 1ULL << dn->dn_indblkshift;
|
|
int n_bps = indbs >> SPA_BLKPTRSHIFT;
|
|
|
|
for (int i = 0; i < n_bps; i++) {
|
|
blkptr_t *bp = &bps[i];
|
|
|
|
ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
|
|
BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
|
|
dn->dn_datablksz : BP_GET_LSIZE(dbbp));
|
|
BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
|
|
BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
|
|
BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Handle reads on dbufs that are holes, if necessary. This function
|
|
* requires that the dbuf's mutex is held. Returns success (0) if action
|
|
* was taken, ENOENT if no action was taken.
|
|
*/
|
|
static int
|
|
dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
|
|
{
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
int is_hole = bp == NULL || BP_IS_HOLE(bp);
|
|
/*
|
|
* For level 0 blocks only, if the above check fails:
|
|
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
|
|
* processes the delete record and clears the bp while we are waiting
|
|
* for the dn_mtx (resulting in a "no" from block_freed).
|
|
*/
|
|
if (!is_hole && db->db_level == 0)
|
|
is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
|
|
|
|
if (is_hole) {
|
|
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
|
|
memset(db->db.db_data, 0, db->db.db_size);
|
|
|
|
if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
|
|
BP_GET_LOGICAL_BIRTH(bp) != 0) {
|
|
dbuf_handle_indirect_hole(db, dn, bp);
|
|
}
|
|
db->db_state = DB_CACHED;
|
|
DTRACE_SET_STATE(db, "hole read satisfied");
|
|
return (0);
|
|
}
|
|
return (ENOENT);
|
|
}
|
|
|
|
/*
|
|
* This function ensures that, when doing a decrypting read of a block,
|
|
* we make sure we have decrypted the dnode associated with it. We must do
|
|
* this so that we ensure we are fully authenticating the checksum-of-MACs
|
|
* tree from the root of the objset down to this block. Indirect blocks are
|
|
* always verified against their secure checksum-of-MACs assuming that the
|
|
* dnode containing them is correct. Now that we are doing a decrypting read,
|
|
* we can be sure that the key is loaded and verify that assumption. This is
|
|
* especially important considering that we always read encrypted dnode
|
|
* blocks as raw data (without verifying their MACs) to start, and
|
|
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
|
|
*/
|
|
static int
|
|
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
|
{
|
|
objset_t *os = db->db_objset;
|
|
dmu_buf_impl_t *dndb;
|
|
arc_buf_t *dnbuf;
|
|
zbookmark_phys_t zb;
|
|
int err;
|
|
|
|
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
|
|
!os->os_encrypted || os->os_raw_receive ||
|
|
(dndb = dn->dn_dbuf) == NULL)
|
|
return (0);
|
|
|
|
dnbuf = dndb->db_buf;
|
|
if (!arc_is_encrypted(dnbuf))
|
|
return (0);
|
|
|
|
mutex_enter(&dndb->db_mtx);
|
|
|
|
/*
|
|
* Since dnode buffer is modified by sync process, there can be only
|
|
* one copy of it. It means we can not modify (decrypt) it while it
|
|
* is being written. I don't see how this may happen now, since
|
|
* encrypted dnode writes by receive should be completed before any
|
|
* plain-text reads due to txg wait, but better be safe than sorry.
|
|
*/
|
|
while (1) {
|
|
if (!arc_is_encrypted(dnbuf)) {
|
|
mutex_exit(&dndb->db_mtx);
|
|
return (0);
|
|
}
|
|
dbuf_dirty_record_t *dr = dndb->db_data_pending;
|
|
if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
|
|
break;
|
|
cv_wait(&dndb->db_changed, &dndb->db_mtx);
|
|
};
|
|
|
|
SET_BOOKMARK(&zb, dmu_objset_id(os),
|
|
DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
|
|
err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
|
|
|
|
/*
|
|
* An error code of EACCES tells us that the key is still not
|
|
* available. This is ok if we are only reading authenticated
|
|
* (and therefore non-encrypted) blocks.
|
|
*/
|
|
if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
|
|
!DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
|
|
(db->db_blkid == DMU_BONUS_BLKID &&
|
|
!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
|
|
err = 0;
|
|
|
|
mutex_exit(&dndb->db_mtx);
|
|
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* Drops db_mtx and the parent lock specified by dblt and tag before
|
|
* returning.
|
|
*/
|
|
static int
|
|
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
|
db_lock_type_t dblt, blkptr_t *bp, const void *tag)
|
|
{
|
|
zbookmark_phys_t zb;
|
|
uint32_t aflags = ARC_FLAG_NOWAIT;
|
|
int err, zio_flags;
|
|
|
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
|
|
ASSERT(db->db_buf == NULL);
|
|
ASSERT(db->db_parent == NULL ||
|
|
RW_LOCK_HELD(&db->db_parent->db_rwlock));
|
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
|
err = dbuf_read_bonus(db, dn);
|
|
goto early_unlock;
|
|
}
|
|
|
|
err = dbuf_read_hole(db, dn, bp);
|
|
if (err == 0)
|
|
goto early_unlock;
|
|
|
|
ASSERT(bp != NULL);
|
|
|
|
/*
|
|
* Any attempt to read a redacted block should result in an error. This
|
|
* will never happen under normal conditions, but can be useful for
|
|
* debugging purposes.
|
|
*/
|
|
if (BP_IS_REDACTED(bp)) {
|
|
ASSERT(dsl_dataset_feature_is_active(
|
|
db->db_objset->os_dsl_dataset,
|
|
SPA_FEATURE_REDACTED_DATASETS));
|
|
err = SET_ERROR(EIO);
|
|
goto early_unlock;
|
|
}
|
|
|
|
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
|
|
|
/*
|
|
* All bps of an encrypted os should have the encryption bit set.
|
|
* If this is not true it indicates tampering and we report an error.
|
|
*/
|
|
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
|
|
spa_log_error(db->db_objset->os_spa, &zb,
|
|
BP_GET_LOGICAL_BIRTH(bp));
|
|
err = SET_ERROR(EIO);
|
|
goto early_unlock;
|
|
}
|
|
|
|
db->db_state = DB_READ;
|
|
DTRACE_SET_STATE(db, "read issued");
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (!DBUF_IS_CACHEABLE(db))
|
|
aflags |= ARC_FLAG_UNCACHED;
|
|
else if (dbuf_is_l2cacheable(db, bp))
|
|
aflags |= ARC_FLAG_L2CACHE;
|
|
|
|
dbuf_add_ref(db, NULL);
|
|
|
|
zio_flags = (flags & DB_RF_CANFAIL) ?
|
|
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
|
|
|
|
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
|
|
zio_flags |= ZIO_FLAG_RAW;
|
|
|
|
/*
|
|
* The zio layer will copy the provided blkptr later, but we need to
|
|
* do this now so that we can release the parent's rwlock. We have to
|
|
* do that now so that if dbuf_read_done is called synchronously (on
|
|
* an l1 cache hit) we don't acquire the db_mtx while holding the
|
|
* parent's rwlock, which would be a lock ordering violation.
|
|
*/
|
|
blkptr_t copy = *bp;
|
|
dmu_buf_unlock_parent(db, dblt, tag);
|
|
return (arc_read(zio, db->db_objset->os_spa, ©,
|
|
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
|
|
&aflags, &zb));
|
|
|
|
early_unlock:
|
|
mutex_exit(&db->db_mtx);
|
|
dmu_buf_unlock_parent(db, dblt, tag);
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* This is our just-in-time copy function. It makes a copy of buffers that
|
|
* have been modified in a previous transaction group before we access them in
|
|
* the current active group.
|
|
*
|
|
* This function is used in three places: when we are dirtying a buffer for the
|
|
* first time in a txg, when we are freeing a range in a dnode that includes
|
|
* this buffer, and when we are accessing a buffer which was received compressed
|
|
* and later referenced in a WRITE_BYREF record.
|
|
*
|
|
* Note that when we are called from dbuf_free_range() we do not put a hold on
|
|
* the buffer, we just traverse the active dbuf list for the dnode.
|
|
*/
|
|
static void
|
|
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
|
{
|
|
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
ASSERT(db->db.db_data != NULL);
|
|
ASSERT(db->db_level == 0);
|
|
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
|
|
|
|
if (dr == NULL ||
|
|
(dr->dt.dl.dr_data !=
|
|
((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
|
|
return;
|
|
|
|
/*
|
|
* If the last dirty record for this dbuf has not yet synced
|
|
* and its referencing the dbuf data, either:
|
|
* reset the reference to point to a new copy,
|
|
* or (if there a no active holders)
|
|
* just null out the current db_data pointer.
|
|
*/
|
|
ASSERT3U(dr->dr_txg, >=, txg - 2);
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
|
dnode_t *dn = DB_DNODE(db);
|
|
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
|
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
|
|
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
|
|
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
|
|
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
|
|
dnode_t *dn = DB_DNODE(db);
|
|
int size = arc_buf_size(db->db_buf);
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
|
spa_t *spa = db->db_objset->os_spa;
|
|
enum zio_compress compress_type =
|
|
arc_get_compression(db->db_buf);
|
|
uint8_t complevel = arc_get_complevel(db->db_buf);
|
|
|
|
if (arc_is_encrypted(db->db_buf)) {
|
|
boolean_t byteorder;
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
uint8_t mac[ZIO_DATA_MAC_LEN];
|
|
|
|
arc_get_raw_params(db->db_buf, &byteorder, salt,
|
|
iv, mac);
|
|
dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
|
|
dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
|
|
mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
|
|
compress_type, complevel);
|
|
} else if (compress_type != ZIO_COMPRESS_OFF) {
|
|
ASSERT3U(type, ==, ARC_BUFC_DATA);
|
|
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
|
|
size, arc_buf_lsize(db->db_buf), compress_type,
|
|
complevel);
|
|
} else {
|
|
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
|
|
}
|
|
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
|
|
} else {
|
|
db->db_buf = NULL;
|
|
dbuf_clear_data(db);
|
|
}
|
|
}
|
|
|
|
int
|
|
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
|
{
|
|
dnode_t *dn;
|
|
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
|
|
int err;
|
|
|
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
|
|
|
DB_DNODE_ENTER(db);
|
|
dn = DB_DNODE(db);
|
|
|
|
/*
|
|
* Ensure that this block's dnode has been decrypted if the caller
|
|
* has requested decrypted data.
|
|
*/
|
|
err = dbuf_read_verify_dnode_crypt(db, dn, flags);
|
|
if (err != 0)
|
|
goto done;
|
|
|
|
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
|
(flags & DB_RF_NOPREFETCH) == 0;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
if (flags & DB_RF_PARTIAL_FIRST)
|
|
db->db_partial_read = B_TRUE;
|
|
else if (!(flags & DB_RF_PARTIAL_MORE))
|
|
db->db_partial_read = B_FALSE;
|
|
miss = (db->db_state != DB_CACHED);
|
|
|
|
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
|
|
/*
|
|
* Another reader came in while the dbuf was in flight between
|
|
* UNCACHED and CACHED. Either a writer will finish filling
|
|
* the buffer, sending the dbuf to CACHED, or the first reader's
|
|
* request will reach the read_done callback and send the dbuf
|
|
* to CACHED. Otherwise, a failure occurred and the dbuf will
|
|
* be sent to UNCACHED.
|
|
*/
|
|
if (flags & DB_RF_NEVERWAIT) {
|
|
mutex_exit(&db->db_mtx);
|
|
DB_DNODE_EXIT(db);
|
|
goto done;
|
|
}
|
|
do {
|
|
ASSERT(db->db_state == DB_READ ||
|
|
(flags & DB_RF_HAVESTRUCT) == 0);
|
|
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
|
|
zio_t *, pio);
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
} while (db->db_state == DB_READ || db->db_state == DB_FILL);
|
|
if (db->db_state == DB_UNCACHED) {
|
|
err = SET_ERROR(EIO);
|
|
mutex_exit(&db->db_mtx);
|
|
DB_DNODE_EXIT(db);
|
|
goto done;
|
|
}
|
|
}
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
/*
|
|
* If the arc buf is compressed or encrypted and the caller
|
|
* requested uncompressed data, we need to untransform it
|
|
* before returning. We also call arc_untransform() on any
|
|
* unauthenticated blocks, which will verify their MAC if
|
|
* the key is now available.
|
|
*/
|
|
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
|
|
(arc_is_encrypted(db->db_buf) ||
|
|
arc_is_unauthenticated(db->db_buf) ||
|
|
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
|
|
spa_t *spa = dn->dn_objset->os_spa;
|
|
zbookmark_phys_t zb;
|
|
|
|
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
|
dbuf_fix_old_data(db, spa_syncing_txg(spa));
|
|
err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
|
|
dbuf_set_data(db, db->db_buf);
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
} else {
|
|
ASSERT(db->db_state == DB_UNCACHED ||
|
|
db->db_state == DB_NOFILL);
|
|
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
|
blkptr_t *bp;
|
|
|
|
/*
|
|
* If a block clone or Direct I/O write has occurred we will
|
|
* get the dirty records overridden BP so we get the most
|
|
* recent data.
|
|
*/
|
|
err = dmu_buf_get_bp_from_dbuf(db, &bp);
|
|
|
|
if (!err) {
|
|
if (pio == NULL && (db->db_state == DB_NOFILL ||
|
|
(bp != NULL && !BP_IS_HOLE(bp)))) {
|
|
spa_t *spa = dn->dn_objset->os_spa;
|
|
pio =
|
|
zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
|
need_wait = B_TRUE;
|
|
}
|
|
|
|
err =
|
|
dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
|
|
} else {
|
|
mutex_exit(&db->db_mtx);
|
|
dmu_buf_unlock_parent(db, dblt, FTAG);
|
|
}
|
|
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
|
|
miss = (db->db_state != DB_CACHED);
|
|
}
|
|
|
|
if (err == 0 && prefetch) {
|
|
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
|
|
flags & DB_RF_HAVESTRUCT);
|
|
}
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
* If we created a zio we must execute it to avoid leaking it, even if
|
|
* it isn't attached to any work due to an error in dbuf_read_impl().
|
|
*/
|
|
if (need_wait) {
|
|
if (err == 0)
|
|
err = zio_wait(pio);
|
|
else
|
|
(void) zio_wait(pio);
|
|
pio = NULL;
|
|
}
|
|
|
|
done:
|
|
if (miss)
|
|
DBUF_STAT_BUMP(hash_misses);
|
|
else
|
|
DBUF_STAT_BUMP(hash_hits);
|
|
if (pio && err != 0) {
|
|
zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
|
|
ZIO_FLAG_CANFAIL);
|
|
zio->io_error = err;
|
|
zio_nowait(zio);
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
static void
|
|
dbuf_noread(dmu_buf_impl_t *db)
|
|
{
|
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
mutex_enter(&db->db_mtx);
|
|
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
if (db->db_state == DB_UNCACHED) {
|
|
ASSERT(db->db_buf == NULL);
|
|
ASSERT(db->db.db_data == NULL);
|
|
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
|
|
db->db_state = DB_FILL;
|
|
DTRACE_SET_STATE(db, "assigning filled buffer");
|
|
} else if (db->db_state == DB_NOFILL) {
|
|
dbuf_clear_data(db);
|
|
} else {
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
}
|
|
|
|
void
|
|
dbuf_unoverride(dbuf_dirty_record_t *dr)
|
|
{
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
|
|
uint64_t txg = dr->dr_txg;
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
/*
|
|
* This assert is valid because dmu_sync() expects to be called by
|
|
* a zilog's get_data while holding a range lock. This call only
|
|
* comes from dbuf_dirty() callers who must also hold a range lock.
|
|
*/
|
|
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
|
|
ASSERT(db->db_level == 0);
|
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID ||
|
|
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
|
|
return;
|
|
|
|
ASSERT(db->db_data_pending != dr);
|
|
|
|
/* free this block */
|
|
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
|
|
zio_free(db->db_objset->os_spa, txg, bp);
|
|
|
|
if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
|
|
ASSERT0P(dr->dt.dl.dr_data);
|
|
dr->dt.dl.dr_data = db->db_buf;
|
|
}
|
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
|
dr->dt.dl.dr_nopwrite = B_FALSE;
|
|
dr->dt.dl.dr_brtwrite = B_FALSE;
|
|
dr->dt.dl.dr_diowrite = B_FALSE;
|
|
dr->dt.dl.dr_has_raw_params = B_FALSE;
|
|
|
|
/*
|
|
* In the event that Direct I/O was used, we do not
|
|
* need to release the buffer from the ARC.
|
|
*
|
|
* Release the already-written buffer, so we leave it in
|
|
* a consistent dirty state. Note that all callers are
|
|
* modifying the buffer, so they will immediately do
|
|
* another (redundant) arc_release(). Therefore, leave
|
|
* the buf thawed to save the effort of freezing &
|
|
* immediately re-thawing it.
|
|
*/
|
|
if (dr->dt.dl.dr_data)
|
|
arc_release(dr->dt.dl.dr_data, db);
|
|
}
|
|
|
|
/*
|
|
* Evict (if its unreferenced) or clear (if its referenced) any level-0
|
|
* data blocks in the free range, so that any future readers will find
|
|
* empty blocks.
|
|
*/
|
|
void
|
|
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
|
|
dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db_search;
|
|
dmu_buf_impl_t *db, *db_next;
|
|
uint64_t txg = tx->tx_txg;
|
|
avl_index_t where;
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
if (end_blkid > dn->dn_maxblkid &&
|
|
!(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
|
|
end_blkid = dn->dn_maxblkid;
|
|
dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
|
|
(u_longlong_t)end_blkid);
|
|
|
|
db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
|
|
db_search->db_level = 0;
|
|
db_search->db_blkid = start_blkid;
|
|
db_search->db_state = DB_SEARCH;
|
|
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
|
db = avl_find(&dn->dn_dbufs, db_search, &where);
|
|
ASSERT3P(db, ==, NULL);
|
|
|
|
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
|
|
|
|
for (; db != NULL; db = db_next) {
|
|
db_next = AVL_NEXT(&dn->dn_dbufs, db);
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
|
|
if (db->db_level != 0 || db->db_blkid > end_blkid) {
|
|
break;
|
|
}
|
|
ASSERT3U(db->db_blkid, >=, start_blkid);
|
|
|
|
/* found a level 0 buffer in the range */
|
|
mutex_enter(&db->db_mtx);
|
|
if (dbuf_undirty(db, tx)) {
|
|
/* mutex has been dropped and dbuf destroyed */
|
|
continue;
|
|
}
|
|
|
|
if (db->db_state == DB_UNCACHED ||
|
|
db->db_state == DB_NOFILL ||
|
|
db->db_state == DB_EVICTING) {
|
|
ASSERT(db->db.db_data == NULL);
|
|
mutex_exit(&db->db_mtx);
|
|
continue;
|
|
}
|
|
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
|
|
/* will be handled in dbuf_read_done or dbuf_rele */
|
|
db->db_freed_in_flight = TRUE;
|
|
mutex_exit(&db->db_mtx);
|
|
continue;
|
|
}
|
|
if (zfs_refcount_count(&db->db_holds) == 0) {
|
|
ASSERT(db->db_buf);
|
|
dbuf_destroy(db);
|
|
continue;
|
|
}
|
|
/* The dbuf is referenced */
|
|
|
|
dr = list_head(&db->db_dirty_records);
|
|
if (dr != NULL) {
|
|
if (dr->dr_txg == txg) {
|
|
/*
|
|
* This buffer is "in-use", re-adjust the file
|
|
* size to reflect that this buffer may
|
|
* contain new data when we sync.
|
|
*/
|
|
if (db->db_blkid != DMU_SPILL_BLKID &&
|
|
db->db_blkid > dn->dn_maxblkid)
|
|
dn->dn_maxblkid = db->db_blkid;
|
|
dbuf_unoverride(dr);
|
|
} else {
|
|
/*
|
|
* This dbuf is not dirty in the open context.
|
|
* Either uncache it (if its not referenced in
|
|
* the open context) or reset its contents to
|
|
* empty.
|
|
*/
|
|
dbuf_fix_old_data(db, txg);
|
|
}
|
|
}
|
|
/* clear the contents if its cached */
|
|
if (db->db_state == DB_CACHED) {
|
|
ASSERT(db->db.db_data != NULL);
|
|
arc_release(db->db_buf, db);
|
|
rw_enter(&db->db_rwlock, RW_WRITER);
|
|
memset(db->db.db_data, 0, db->db.db_size);
|
|
rw_exit(&db->db_rwlock);
|
|
arc_buf_freeze(db->db_buf);
|
|
}
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
}
|
|
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
kmem_free(db_search, sizeof (dmu_buf_impl_t));
|
|
}
|
|
|
|
void
|
|
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
|
|
{
|
|
arc_buf_t *buf, *old_buf;
|
|
dbuf_dirty_record_t *dr;
|
|
int osize = db->db.db_size;
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
|
dnode_t *dn;
|
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
|
|
DB_DNODE_ENTER(db);
|
|
dn = DB_DNODE(db);
|
|
|
|
/*
|
|
* XXX we should be doing a dbuf_read, checking the return
|
|
* value and returning that up to our callers
|
|
*/
|
|
dmu_buf_will_dirty(&db->db, tx);
|
|
|
|
VERIFY3P(db->db_buf, !=, NULL);
|
|
|
|
/* create the data buffer for the new block */
|
|
buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
|
|
|
|
/* copy old block data to the new block */
|
|
old_buf = db->db_buf;
|
|
memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));
|
|
/* zero the remainder */
|
|
if (size > osize)
|
|
memset((uint8_t *)buf->b_data + osize, 0, size - osize);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
dbuf_set_data(db, buf);
|
|
arc_buf_destroy(old_buf, db);
|
|
db->db.db_size = size;
|
|
|
|
dr = list_head(&db->db_dirty_records);
|
|
/* dirty record added by dmu_buf_will_dirty() */
|
|
VERIFY(dr != NULL);
|
|
if (db->db_level == 0)
|
|
dr->dt.dl.dr_data = buf;
|
|
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
|
|
ASSERT3U(dr->dr_accounted, ==, osize);
|
|
dr->dr_accounted = size;
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
|
|
DB_DNODE_EXIT(db);
|
|
}
|
|
|
|
void
|
|
dbuf_release_bp(dmu_buf_impl_t *db)
|
|
{
|
|
objset_t *os __maybe_unused = db->db_objset;
|
|
|
|
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
|
|
ASSERT(arc_released(os->os_phys_buf) ||
|
|
list_link_active(&os->os_dsl_dataset->ds_synced_link));
|
|
ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
|
|
|
|
(void) arc_release(db->db_buf, db);
|
|
}
|
|
|
|
/*
|
|
* We already have a dirty record for this TXG, and we are being
|
|
* dirtied again.
|
|
*/
|
|
static void
|
|
dbuf_redirty(dbuf_dirty_record_t *dr)
|
|
{
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
|
|
/*
|
|
* If this buffer has already been written out,
|
|
* we now need to reset its state.
|
|
*/
|
|
dbuf_unoverride(dr);
|
|
if (db->db.db_object != DMU_META_DNODE_OBJECT &&
|
|
db->db_state != DB_NOFILL) {
|
|
/* Already released on initial dirty, so just thaw. */
|
|
ASSERT(arc_released(db->db_buf));
|
|
arc_buf_thaw(db->db_buf);
|
|
}
|
|
}
|
|
}
|
|
|
|
dbuf_dirty_record_t *
|
|
dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
|
|
{
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
|
|
dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
|
|
ASSERT(dn->dn_maxblkid >= blkid);
|
|
|
|
dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
|
|
list_link_init(&dr->dr_dirty_node);
|
|
list_link_init(&dr->dr_dbuf_node);
|
|
dr->dr_dnode = dn;
|
|
dr->dr_txg = tx->tx_txg;
|
|
dr->dt.dll.dr_blkid = blkid;
|
|
dr->dr_accounted = dn->dn_datablksz;
|
|
|
|
/*
|
|
* There should not be any dbuf for the block that we're dirtying.
|
|
* Otherwise the buffer contents could be inconsistent between the
|
|
* dbuf and the lightweight dirty record.
|
|
*/
|
|
ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
|
|
NULL));
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
int txgoff = tx->tx_txg & TXG_MASK;
|
|
if (dn->dn_free_ranges[txgoff] != NULL) {
|
|
range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
|
|
}
|
|
|
|
if (dn->dn_nlevels == 1) {
|
|
ASSERT3U(blkid, <, dn->dn_nblkptr);
|
|
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
|
|
mutex_exit(&dn->dn_mtx);
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
dnode_setdirty(dn, tx);
|
|
} else {
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
|
|
1, blkid >> epbs, FTAG);
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
if (parent_db == NULL) {
|
|
kmem_free(dr, sizeof (*dr));
|
|
return (NULL);
|
|
}
|
|
int err = dbuf_read(parent_db, NULL,
|
|
(DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
|
if (err != 0) {
|
|
dbuf_rele(parent_db, FTAG);
|
|
kmem_free(dr, sizeof (*dr));
|
|
return (NULL);
|
|
}
|
|
|
|
dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
|
|
dbuf_rele(parent_db, FTAG);
|
|
mutex_enter(&parent_dr->dt.di.dr_mtx);
|
|
ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
|
|
list_insert_tail(&parent_dr->dt.di.dr_children, dr);
|
|
mutex_exit(&parent_dr->dt.di.dr_mtx);
|
|
dr->dr_parent = parent_dr;
|
|
}
|
|
|
|
dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
|
|
|
|
return (dr);
|
|
}
|
|
|
|
dbuf_dirty_record_t *
|
|
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
{
|
|
dnode_t *dn;
|
|
objset_t *os;
|
|
dbuf_dirty_record_t *dr, *dr_next, *dr_head;
|
|
int txgoff = tx->tx_txg & TXG_MASK;
|
|
boolean_t drop_struct_rwlock = B_FALSE;
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
|
DMU_TX_DIRTY_BUF(tx, db);
|
|
|
|
DB_DNODE_ENTER(db);
|
|
dn = DB_DNODE(db);
|
|
/*
|
|
* Shouldn't dirty a regular buffer in syncing context. Private
|
|
* objects may be dirtied in syncing context, but only if they
|
|
* were already pre-dirtied in open context.
|
|
*/
|
|
#ifdef ZFS_DEBUG
|
|
if (dn->dn_objset->os_dsl_dataset != NULL) {
|
|
rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
|
|
RW_READER, FTAG);
|
|
}
|
|
ASSERT(!dmu_tx_is_syncing(tx) ||
|
|
BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
|
|
DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
|
|
dn->dn_objset->os_dsl_dataset == NULL);
|
|
if (dn->dn_objset->os_dsl_dataset != NULL)
|
|
rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
|
|
#endif
|
|
/*
|
|
* We make this assert for private objects as well, but after we
|
|
* check if we're already dirty. They are allowed to re-dirty
|
|
* in syncing context.
|
|
*/
|
|
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
|
|
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
/*
|
|
* XXX make this true for indirects too? The problem is that
|
|
* transactions created with dmu_tx_create_assigned() from
|
|
* syncing context don't bother holding ahead.
|
|
*/
|
|
ASSERT(db->db_level != 0 ||
|
|
db->db_state == DB_CACHED || db->db_state == DB_FILL ||
|
|
db->db_state == DB_NOFILL);
|
|
|
|
mutex_enter(&dn->dn_mtx);
|
|
dnode_set_dirtyctx(dn, tx, db);
|
|
if (tx->tx_txg > dn->dn_dirty_txg)
|
|
dn->dn_dirty_txg = tx->tx_txg;
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID)
|
|
dn->dn_have_spill = B_TRUE;
|
|
|
|
/*
|
|
* If this buffer is already dirty, we're done.
|
|
*/
|
|
dr_head = list_head(&db->db_dirty_records);
|
|
ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
|
|
db->db.db_object == DMU_META_DNODE_OBJECT);
|
|
dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
|
|
if (dr_next && dr_next->dr_txg == tx->tx_txg) {
|
|
DB_DNODE_EXIT(db);
|
|
|
|
dbuf_redirty(dr_next);
|
|
mutex_exit(&db->db_mtx);
|
|
return (dr_next);
|
|
}
|
|
|
|
/*
|
|
* Only valid if not already dirty.
|
|
*/
|
|
ASSERT(dn->dn_object == 0 ||
|
|
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
|
|
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
|
|
|
|
ASSERT3U(dn->dn_nlevels, >, db->db_level);
|
|
|
|
/*
|
|
* We should only be dirtying in syncing context if it's the
|
|
* mos or we're initializing the os or it's a special object.
|
|
* However, we are allowed to dirty in syncing context provided
|
|
* we already dirtied it in open context. Hence we must make
|
|
* this assertion only if we're not already dirty.
|
|
*/
|
|
os = dn->dn_objset;
|
|
VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
|
|
#ifdef ZFS_DEBUG
|
|
if (dn->dn_objset->os_dsl_dataset != NULL)
|
|
rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
|
|
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
|
|
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
|
|
if (dn->dn_objset->os_dsl_dataset != NULL)
|
|
rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
|
|
#endif
|
|
ASSERT(db->db.db_size != 0);
|
|
|
|
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
|
|
|
|
if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
|
|
dmu_objset_willuse_space(os, db->db.db_size, tx);
|
|
}
|
|
|
|
/*
|
|
* If this buffer is dirty in an old transaction group we need
|
|
* to make a copy of it so that the changes we make in this
|
|
* transaction group won't leak out when we sync the older txg.
|
|
*/
|
|
dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
|
|
list_link_init(&dr->dr_dirty_node);
|
|
list_link_init(&dr->dr_dbuf_node);
|
|
dr->dr_dnode = dn;
|
|
if (db->db_level == 0) {
|
|
void *data_old = db->db_buf;
|
|
|
|
if (db->db_state != DB_NOFILL) {
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
|
dbuf_fix_old_data(db, tx->tx_txg);
|
|
data_old = db->db.db_data;
|
|
} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
|
|
/*
|
|
* Release the data buffer from the cache so
|
|
* that we can modify it without impacting
|
|
* possible other users of this cached data
|
|
* block. Note that indirect blocks and
|
|
* private objects are not released until the
|
|
* syncing state (since they are only modified
|
|
* then).
|
|
*/
|
|
arc_release(db->db_buf, db);
|
|
dbuf_fix_old_data(db, tx->tx_txg);
|
|
data_old = db->db_buf;
|
|
}
|
|
ASSERT(data_old != NULL);
|
|
}
|
|
dr->dt.dl.dr_data = data_old;
|
|
} else {
|
|
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
|
|
list_create(&dr->dt.di.dr_children,
|
|
sizeof (dbuf_dirty_record_t),
|
|
offsetof(dbuf_dirty_record_t, dr_dirty_node));
|
|
}
|
|
if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
|
|
dr->dr_accounted = db->db.db_size;
|
|
}
|
|
dr->dr_dbuf = db;
|
|
dr->dr_txg = tx->tx_txg;
|
|
list_insert_before(&db->db_dirty_records, dr_next, dr);
|
|
|
|
/*
|
|
* We could have been freed_in_flight between the dbuf_noread
|
|
* and dbuf_dirty. We win, as though the dbuf_noread() had
|
|
* happened after the free.
|
|
*/
|
|
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
|
db->db_blkid != DMU_SPILL_BLKID) {
|
|
mutex_enter(&dn->dn_mtx);
|
|
if (dn->dn_free_ranges[txgoff] != NULL) {
|
|
range_tree_clear(dn->dn_free_ranges[txgoff],
|
|
db->db_blkid, 1);
|
|
}
|
|
mutex_exit(&dn->dn_mtx);
|
|
db->db_freed_in_flight = FALSE;
|
|
}
|
|
|
|
/*
|
|
* This buffer is now part of this txg
|
|
*/
|
|
dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
|
|
db->db_dirtycnt += 1;
|
|
ASSERT3U(db->db_dirtycnt, <=, 3);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID ||
|
|
db->db_blkid == DMU_SPILL_BLKID) {
|
|
mutex_enter(&dn->dn_mtx);
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
|
|
mutex_exit(&dn->dn_mtx);
|
|
dnode_setdirty(dn, tx);
|
|
DB_DNODE_EXIT(db);
|
|
return (dr);
|
|
}
|
|
|
|
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
drop_struct_rwlock = B_TRUE;
|
|
}
|
|
|
|
/*
|
|
* If we are overwriting a dedup BP, then unless it is snapshotted,
|
|
* when we get to syncing context we will need to decrement its
|
|
* refcount in the DDT. Prefetch the relevant DDT block so that
|
|
* syncing context won't have to wait for the i/o.
|
|
*/
|
|
if (db->db_blkptr != NULL) {
|
|
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
|
ddt_prefetch(os->os_spa, db->db_blkptr);
|
|
dmu_buf_unlock_parent(db, dblt, FTAG);
|
|
}
|
|
|
|
/*
|
|
* We need to hold the dn_struct_rwlock to make this assertion,
|
|
* because it protects dn_phys / dn_next_nlevels from changing.
|
|
*/
|
|
ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
|
|
dn->dn_phys->dn_nlevels > db->db_level ||
|
|
dn->dn_next_nlevels[txgoff] > db->db_level ||
|
|
dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
|
|
dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
|
|
|
|
|
|
if (db->db_level == 0) {
|
|
ASSERT(!db->db_objset->os_raw_receive ||
|
|
dn->dn_maxblkid >= db->db_blkid);
|
|
dnode_new_blkid(dn, db->db_blkid, tx,
|
|
drop_struct_rwlock, B_FALSE);
|
|
ASSERT(dn->dn_maxblkid >= db->db_blkid);
|
|
}
|
|
|
|
if (db->db_level+1 < dn->dn_nlevels) {
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
dbuf_dirty_record_t *di;
|
|
int parent_held = FALSE;
|
|
|
|
if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
|
|
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
parent = dbuf_hold_level(dn, db->db_level + 1,
|
|
db->db_blkid >> epbs, FTAG);
|
|
ASSERT(parent != NULL);
|
|
parent_held = TRUE;
|
|
}
|
|
if (drop_struct_rwlock)
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
ASSERT3U(db->db_level + 1, ==, parent->db_level);
|
|
di = dbuf_dirty(parent, tx);
|
|
if (parent_held)
|
|
dbuf_rele(parent, FTAG);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
/*
|
|
* Since we've dropped the mutex, it's possible that
|
|
* dbuf_undirty() might have changed this out from under us.
|
|
*/
|
|
if (list_head(&db->db_dirty_records) == dr ||
|
|
dn->dn_object == DMU_META_DNODE_OBJECT) {
|
|
mutex_enter(&di->dt.di.dr_mtx);
|
|
ASSERT3U(di->dr_txg, ==, tx->tx_txg);
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
list_insert_tail(&di->dt.di.dr_children, dr);
|
|
mutex_exit(&di->dt.di.dr_mtx);
|
|
dr->dr_parent = di;
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
} else {
|
|
ASSERT(db->db_level + 1 == dn->dn_nlevels);
|
|
ASSERT(db->db_blkid < dn->dn_nblkptr);
|
|
ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
|
|
mutex_enter(&dn->dn_mtx);
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
|
|
mutex_exit(&dn->dn_mtx);
|
|
if (drop_struct_rwlock)
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
}
|
|
|
|
dnode_setdirty(dn, tx);
|
|
DB_DNODE_EXIT(db);
|
|
return (dr);
|
|
}
|
|
|
|
static void
|
|
dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
|
|
{
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
if (dr->dt.dl.dr_data != db->db.db_data) {
|
|
struct dnode *dn = dr->dr_dnode;
|
|
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
|
|
|
kmem_free(dr->dt.dl.dr_data, max_bonuslen);
|
|
arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
|
|
}
|
|
db->db_data_pending = NULL;
|
|
ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
|
|
list_remove(&db->db_dirty_records, dr);
|
|
if (dr->dr_dbuf->db_level != 0) {
|
|
mutex_destroy(&dr->dt.di.dr_mtx);
|
|
list_destroy(&dr->dt.di.dr_children);
|
|
}
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
ASSERT3U(db->db_dirtycnt, >, 0);
|
|
db->db_dirtycnt -= 1;
|
|
}
|
|
|
|
/*
|
|
* Undirty a buffer in the transaction group referenced by the given
|
|
* transaction. Return whether this evicted the dbuf.
|
|
*/
|
|
boolean_t
|
|
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
{
|
|
uint64_t txg = tx->tx_txg;
|
|
boolean_t brtwrite;
|
|
boolean_t diowrite;
|
|
|
|
ASSERT(txg != 0);
|
|
|
|
/*
|
|
* Due to our use of dn_nlevels below, this can only be called
|
|
* in open context, unless we are operating on the MOS.
|
|
* From syncing context, dn_nlevels may be different from the
|
|
* dn_nlevels used when dbuf was dirtied.
|
|
*/
|
|
ASSERT(db->db_objset ==
|
|
dmu_objset_pool(db->db_objset)->dp_meta_objset ||
|
|
txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
ASSERT0(db->db_level);
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
/*
|
|
* If this buffer is not dirty, we're done.
|
|
*/
|
|
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
|
|
if (dr == NULL)
|
|
return (B_FALSE);
|
|
ASSERT(dr->dr_dbuf == db);
|
|
|
|
brtwrite = dr->dt.dl.dr_brtwrite;
|
|
diowrite = dr->dt.dl.dr_diowrite;
|
|
if (brtwrite) {
|
|
ASSERT3B(diowrite, ==, B_FALSE);
|
|
/*
|
|
* We are freeing a block that we cloned in the same
|
|
* transaction group.
|
|
*/
|
|
brt_pending_remove(dmu_objset_spa(db->db_objset),
|
|
&dr->dt.dl.dr_overridden_by, tx);
|
|
}
|
|
|
|
dnode_t *dn = dr->dr_dnode;
|
|
|
|
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
|
|
|
|
ASSERT(db->db.db_size != 0);
|
|
|
|
dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
|
|
dr->dr_accounted, txg);
|
|
|
|
list_remove(&db->db_dirty_records, dr);
|
|
|
|
/*
|
|
* Note that there are three places in dbuf_dirty()
|
|
* where this dirty record may be put on a list.
|
|
* Make sure to do a list_remove corresponding to
|
|
* every one of those list_insert calls.
|
|
*/
|
|
if (dr->dr_parent) {
|
|
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
|
|
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
|
|
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
|
|
} else if (db->db_blkid == DMU_SPILL_BLKID ||
|
|
db->db_level + 1 == dn->dn_nlevels) {
|
|
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
|
|
mutex_enter(&dn->dn_mtx);
|
|
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
|
|
mutex_exit(&dn->dn_mtx);
|
|
}
|
|
|
|
if (db->db_state != DB_NOFILL && !brtwrite) {
|
|
dbuf_unoverride(dr);
|
|
|
|
if (dr->dt.dl.dr_data != db->db_buf) {
|
|
ASSERT(db->db_buf != NULL);
|
|
ASSERT(dr->dt.dl.dr_data != NULL);
|
|
arc_buf_destroy(dr->dt.dl.dr_data, db);
|
|
}
|
|
}
|
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
db->db_dirtycnt -= 1;
|
|
|
|
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
|
|
ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
|
|
arc_released(db->db_buf));
|
|
dbuf_destroy(db);
|
|
return (B_TRUE);
|
|
}
|
|
|
|
return (B_FALSE);
|
|
}
|
|
|
|
static void
|
|
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
boolean_t undirty = B_FALSE;
|
|
|
|
ASSERT(tx->tx_txg != 0);
|
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
|
|
|
/*
|
|
* Quick check for dirtiness to improve performance for some workloads
|
|
* (e.g. file deletion with indirect blocks cached).
|
|
*/
|
|
mutex_enter(&db->db_mtx);
|
|
if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
|
|
/*
|
|
* It's possible that the dbuf is already dirty but not cached,
|
|
* because there are some calls to dbuf_dirty() that don't
|
|
* go through dmu_buf_will_dirty().
|
|
*/
|
|
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
|
|
if (dr != NULL) {
|
|
if (db->db_level == 0 &&
|
|
dr->dt.dl.dr_brtwrite) {
|
|
/*
|
|
* Block cloning: If we are dirtying a cloned
|
|
* level 0 block, we cannot simply redirty it,
|
|
* because this dr has no associated data.
|
|
* We will go through a full undirtying below,
|
|
* before dirtying it again.
|
|
*/
|
|
undirty = B_TRUE;
|
|
} else {
|
|
/* This dbuf is already dirty and cached. */
|
|
dbuf_redirty(dr);
|
|
mutex_exit(&db->db_mtx);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
DB_DNODE_ENTER(db);
|
|
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
|
|
flags |= DB_RF_HAVESTRUCT;
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
|
|
* want to make sure dbuf_read() will read the pending cloned block and
|
|
* not the uderlying block that is being replaced. dbuf_undirty() will
|
|
* do brt_pending_remove() before removing the dirty record.
|
|
*/
|
|
(void) dbuf_read(db, NULL, flags);
|
|
if (undirty) {
|
|
mutex_enter(&db->db_mtx);
|
|
VERIFY(!dbuf_undirty(db, tx));
|
|
mutex_exit(&db->db_mtx);
|
|
}
|
|
(void) dbuf_dirty(db, tx);
|
|
}
|
|
|
|
void
|
|
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_will_dirty_impl(db_fake,
|
|
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
|
|
}
|
|
|
|
boolean_t
|
|
dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
|
|
mutex_exit(&db->db_mtx);
|
|
return (dr != NULL);
|
|
}
|
|
|
|
/*
|
|
* Normally the db_blkptr points to the most recent on-disk content for the
|
|
* dbuf (and anything newer will be cached in the dbuf). However, a pending
|
|
* block clone or not yet synced Direct I/O write will have a dirty record BP
|
|
* pointing to the most recent data.
|
|
*/
|
|
int
|
|
dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
|
|
{
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
int error = 0;
|
|
|
|
if (db->db_level != 0) {
|
|
*bp = db->db_blkptr;
|
|
return (0);
|
|
}
|
|
|
|
*bp = db->db_blkptr;
|
|
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
|
|
if (dr && db->db_state == DB_NOFILL) {
|
|
/* Block clone */
|
|
if (!dr->dt.dl.dr_brtwrite)
|
|
error = EIO;
|
|
else
|
|
*bp = &dr->dt.dl.dr_overridden_by;
|
|
} else if (dr && db->db_state == DB_UNCACHED) {
|
|
/* Direct I/O write */
|
|
if (dr->dt.dl.dr_diowrite)
|
|
*bp = &dr->dt.dl.dr_overridden_by;
|
|
}
|
|
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Direct I/O reads can read directly from the ARC, but the data has
|
|
* to be untransformed in order to copy it over into user pages.
|
|
*/
|
|
int
|
|
dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
|
|
{
|
|
int err = 0;
|
|
DB_DNODE_ENTER(db);
|
|
dnode_t *dn = DB_DNODE(db);
|
|
|
|
ASSERT3S(db->db_state, ==, DB_CACHED);
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
/*
|
|
* Ensure that this block's dnode has been decrypted if
|
|
* the caller has requested decrypted data.
|
|
*/
|
|
err = dbuf_read_verify_dnode_crypt(db, dn, 0);
|
|
|
|
/*
|
|
* If the arc buf is compressed or encrypted and the caller
|
|
* requested uncompressed data, we need to untransform it
|
|
* before returning. We also call arc_untransform() on any
|
|
* unauthenticated blocks, which will verify their MAC if
|
|
* the key is now available.
|
|
*/
|
|
if (err == 0 && db->db_buf != NULL &&
|
|
(arc_is_encrypted(db->db_buf) ||
|
|
arc_is_unauthenticated(db->db_buf) ||
|
|
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
|
|
zbookmark_phys_t zb;
|
|
|
|
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
|
dbuf_fix_old_data(db, spa_syncing_txg(spa));
|
|
err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
|
|
dbuf_set_data(db, db->db_buf);
|
|
}
|
|
DB_DNODE_EXIT(db);
|
|
DBUF_STAT_BUMP(hash_hits);
|
|
|
|
return (err);
|
|
}
|
|
|
|
void
|
|
dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
{
|
|
/*
|
|
* Block clones and Direct I/O writes always happen in open-context.
|
|
*/
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
ASSERT0(db->db_level);
|
|
ASSERT(!dmu_tx_is_syncing(tx));
|
|
ASSERT0(db->db_level);
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
DBUF_VERIFY(db);
|
|
|
|
/*
|
|
* We are going to clone or issue a Direct I/O write on this block, so
|
|
* undirty modifications done to this block so far in this txg. This
|
|
* includes writes and clones into this block.
|
|
*
|
|
* If there dirty record associated with this txg from a previous Direct
|
|
* I/O write then space accounting cleanup takes place. It is important
|
|
* to go ahead free up the space accounting through dbuf_undirty() ->
|
|
* dbuf_unoverride() -> zio_free(). Space accountiung for determining
|
|
* if a write can occur in zfs_write() happens through dmu_tx_assign().
|
|
* This can cause an issue with Direct I/O writes in the case of
|
|
* overwriting the same block, because all DVA allocations are being
|
|
* done in open-context. Constantly allowing Direct I/O overwrites to
|
|
* the same block can exhaust the pools available space leading to
|
|
* ENOSPC errors at the DVA allocation part of the ZIO pipeline, which
|
|
* will eventually suspend the pool. By cleaning up sapce acccounting
|
|
* now, the ENOSPC error can be avoided.
|
|
*
|
|
* Since we are undirtying the record in open-context, we must have a
|
|
* hold on the db, so it should never be evicted after calling
|
|
* dbuf_undirty().
|
|
*/
|
|
VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
|
|
ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
|
|
|
|
if (db->db_buf != NULL) {
|
|
/*
|
|
* If there is an associated ARC buffer with this dbuf we can
|
|
* only destroy it if the previous dirty record does not
|
|
* reference it.
|
|
*/
|
|
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
|
|
if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
|
|
arc_buf_destroy(db->db_buf, db);
|
|
|
|
/*
|
|
* Setting the dbuf's data pointers to NULL will force all
|
|
* future reads down to the devices to get the most up to date
|
|
* version of the data after a Direct I/O write has completed.
|
|
*/
|
|
db->db_buf = NULL;
|
|
dbuf_clear_data(db);
|
|
}
|
|
|
|
ASSERT3P(db->db_buf, ==, NULL);
|
|
ASSERT3P(db->db.db_data, ==, NULL);
|
|
|
|
db->db_state = DB_NOFILL;
|
|
DTRACE_SET_STATE(db,
|
|
"allocating NOFILL buffer for clone or direct I/O write");
|
|
|
|
DBUF_VERIFY(db);
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_noread(db);
|
|
(void) dbuf_dirty(db, tx);
|
|
}
|
|
|
|
void
|
|
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
db->db_state = DB_NOFILL;
|
|
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_noread(db);
|
|
(void) dbuf_dirty(db, tx);
|
|
}
|
|
|
|
void
|
|
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
ASSERT(tx->tx_txg != 0);
|
|
ASSERT(db->db_level == 0);
|
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
|
|
|
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
|
|
dmu_tx_private_ok(tx));
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
|
|
if (db->db_state == DB_NOFILL ||
|
|
(db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
|
|
/*
|
|
* If the fill can fail we should have a way to return back to
|
|
* the cloned or Direct I/O write data.
|
|
*/
|
|
if (canfail && dr) {
|
|
mutex_exit(&db->db_mtx);
|
|
dmu_buf_will_dirty(db_fake, tx);
|
|
return;
|
|
}
|
|
/*
|
|
* Block cloning: We will be completely overwriting a block
|
|
* cloned in this transaction group, so let's undirty the
|
|
* pending clone and mark the block as uncached. This will be
|
|
* as if the clone was never done.
|
|
*/
|
|
if (dr && dr->dt.dl.dr_brtwrite) {
|
|
VERIFY(!dbuf_undirty(db, tx));
|
|
db->db_state = DB_UNCACHED;
|
|
}
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_noread(db);
|
|
(void) dbuf_dirty(db, tx);
|
|
}
|
|
|
|
/*
|
|
* This function is effectively the same as dmu_buf_will_dirty(), but
|
|
* indicates the caller expects raw encrypted data in the db, and provides
|
|
* the crypt params (byteorder, salt, iv, mac) which should be stored in the
|
|
* blkptr_t when this dbuf is written. This is only used for blocks of
|
|
* dnodes, during raw receive.
|
|
*/
|
|
void
|
|
dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
|
|
const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
/*
|
|
* dr_has_raw_params is only processed for blocks of dnodes
|
|
* (see dbuf_sync_dnode_leaf_crypt()).
|
|
*/
|
|
ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
|
|
ASSERT3U(db->db_level, ==, 0);
|
|
ASSERT(db->db_objset->os_raw_receive);
|
|
|
|
dmu_buf_will_dirty_impl(db_fake,
|
|
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
|
|
|
|
dr = dbuf_find_dirty_eq(db, tx->tx_txg);
|
|
|
|
ASSERT3P(dr, !=, NULL);
|
|
|
|
dr->dt.dl.dr_has_raw_params = B_TRUE;
|
|
dr->dt.dl.dr_byteorder = byteorder;
|
|
memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
|
|
memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
|
|
memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
|
|
}
|
|
|
|
static void
|
|
dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
|
|
{
|
|
struct dirty_leaf *dl;
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
dr = list_head(&db->db_dirty_records);
|
|
ASSERT3P(dr, !=, NULL);
|
|
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
|
|
dl = &dr->dt.dl;
|
|
dl->dr_overridden_by = *bp;
|
|
dl->dr_override_state = DR_OVERRIDDEN;
|
|
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
|
|
}
|
|
|
|
boolean_t
|
|
dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
|
|
{
|
|
(void) tx;
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
|
|
mutex_enter(&db->db_mtx);
|
|
DBUF_VERIFY(db);
|
|
|
|
if (db->db_state == DB_FILL) {
|
|
if (db->db_level == 0 && db->db_freed_in_flight) {
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
/* we were freed while filling */
|
|
/* XXX dbuf_undirty? */
|
|
memset(db->db.db_data, 0, db->db.db_size);
|
|
db->db_freed_in_flight = FALSE;
|
|
db->db_state = DB_CACHED;
|
|
DTRACE_SET_STATE(db,
|
|
"fill done handling freed in flight");
|
|
failed = B_FALSE;
|
|
} else if (failed) {
|
|
VERIFY(!dbuf_undirty(db, tx));
|
|
arc_buf_destroy(db->db_buf, db);
|
|
db->db_buf = NULL;
|
|
dbuf_clear_data(db);
|
|
DTRACE_SET_STATE(db, "fill failed");
|
|
} else {
|
|
db->db_state = DB_CACHED;
|
|
DTRACE_SET_STATE(db, "fill done");
|
|
}
|
|
cv_broadcast(&db->db_changed);
|
|
} else {
|
|
db->db_state = DB_CACHED;
|
|
failed = B_FALSE;
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
return (failed);
|
|
}
|
|
|
|
void
|
|
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
|
|
bp_embedded_type_t etype, enum zio_compress comp,
|
|
int uncompressed_size, int compressed_size, int byteorder,
|
|
dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
|
|
struct dirty_leaf *dl;
|
|
dmu_object_type_t type;
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
if (etype == BP_EMBEDDED_TYPE_DATA) {
|
|
ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
|
|
SPA_FEATURE_EMBEDDED_DATA));
|
|
}
|
|
|
|
DB_DNODE_ENTER(db);
|
|
type = DB_DNODE(db)->dn_type;
|
|
DB_DNODE_EXIT(db);
|
|
|
|
ASSERT0(db->db_level);
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
|
|
dmu_buf_will_not_fill(dbuf, tx);
|
|
|
|
dr = list_head(&db->db_dirty_records);
|
|
ASSERT3P(dr, !=, NULL);
|
|
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
|
|
dl = &dr->dt.dl;
|
|
encode_embedded_bp_compressed(&dl->dr_overridden_by,
|
|
data, comp, uncompressed_size, compressed_size);
|
|
BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
|
|
BP_SET_TYPE(&dl->dr_overridden_by, type);
|
|
BP_SET_LEVEL(&dl->dr_overridden_by, 0);
|
|
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
|
|
|
|
dl->dr_override_state = DR_OVERRIDDEN;
|
|
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
|
|
}
|
|
|
|
void
|
|
dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
|
|
dmu_object_type_t type;
|
|
ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
|
|
SPA_FEATURE_REDACTED_DATASETS));
|
|
|
|
DB_DNODE_ENTER(db);
|
|
type = DB_DNODE(db)->dn_type;
|
|
DB_DNODE_EXIT(db);
|
|
|
|
ASSERT0(db->db_level);
|
|
dmu_buf_will_not_fill(dbuf, tx);
|
|
|
|
blkptr_t bp = { { { {0} } } };
|
|
BP_SET_TYPE(&bp, type);
|
|
BP_SET_LEVEL(&bp, 0);
|
|
BP_SET_BIRTH(&bp, tx->tx_txg, 0);
|
|
BP_SET_REDACTED(&bp);
|
|
BPE_SET_LSIZE(&bp, dbuf->db_size);
|
|
|
|
dbuf_override_impl(db, &bp, tx);
|
|
}
|
|
|
|
/*
|
|
* Directly assign a provided arc buf to a given dbuf if it's not referenced
|
|
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
|
|
*/
|
|
void
|
|
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
|
|
{
|
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
ASSERT(db->db_level == 0);
|
|
ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
|
|
ASSERT(buf != NULL);
|
|
ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
|
|
ASSERT(tx->tx_txg != 0);
|
|
|
|
arc_return_buf(buf, db);
|
|
ASSERT(arc_released(buf));
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
while (db->db_state == DB_READ || db->db_state == DB_FILL)
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
|
|
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
|
|
db->db_state == DB_NOFILL);
|
|
|
|
if (db->db_state == DB_CACHED &&
|
|
zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
|
|
/*
|
|
* In practice, we will never have a case where we have an
|
|
* encrypted arc buffer while additional holds exist on the
|
|
* dbuf. We don't handle this here so we simply assert that
|
|
* fact instead.
|
|
*/
|
|
ASSERT(!arc_is_encrypted(buf));
|
|
mutex_exit(&db->db_mtx);
|
|
(void) dbuf_dirty(db, tx);
|
|
memcpy(db->db.db_data, buf->b_data, db->db.db_size);
|
|
arc_buf_destroy(buf, db);
|
|
return;
|
|
}
|
|
|
|
if (db->db_state == DB_CACHED) {
|
|
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
|
|
|
|
ASSERT(db->db_buf != NULL);
|
|
if (dr != NULL && dr->dr_txg == tx->tx_txg) {
|
|
ASSERT(dr->dt.dl.dr_data == db->db_buf);
|
|
|
|
if (!arc_released(db->db_buf)) {
|
|
ASSERT(dr->dt.dl.dr_override_state ==
|
|
DR_OVERRIDDEN);
|
|
arc_release(db->db_buf, db);
|
|
}
|
|
dr->dt.dl.dr_data = buf;
|
|
arc_buf_destroy(db->db_buf, db);
|
|
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
|
|
arc_release(db->db_buf, db);
|
|
arc_buf_destroy(db->db_buf, db);
|
|
}
|
|
db->db_buf = NULL;
|
|
} else if (db->db_state == DB_NOFILL) {
|
|
/*
|
|
* We will be completely replacing the cloned block. In case
|
|
* it was cloned in this transaction group, let's undirty the
|
|
* pending clone and mark the block as uncached. This will be
|
|
* as if the clone was never done.
|
|
*/
|
|
VERIFY(!dbuf_undirty(db, tx));
|
|
db->db_state = DB_UNCACHED;
|
|
}
|
|
ASSERT(db->db_buf == NULL);
|
|
dbuf_set_data(db, buf);
|
|
db->db_state = DB_FILL;
|
|
DTRACE_SET_STATE(db, "filling assigned arcbuf");
|
|
mutex_exit(&db->db_mtx);
|
|
(void) dbuf_dirty(db, tx);
|
|
dmu_buf_fill_done(&db->db, tx, B_FALSE);
|
|
}
|
|
|
|
void
|
|
dbuf_destroy(dmu_buf_impl_t *db)
|
|
{
|
|
dnode_t *dn;
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
dmu_buf_impl_t *dndb;
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
ASSERT(zfs_refcount_is_zero(&db->db_holds));
|
|
|
|
if (db->db_buf != NULL) {
|
|
arc_buf_destroy(db->db_buf, db);
|
|
db->db_buf = NULL;
|
|
}
|
|
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
|
int slots = DB_DNODE(db)->dn_num_slots;
|
|
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
|
|
if (db->db.db_data != NULL) {
|
|
kmem_free(db->db.db_data, bonuslen);
|
|
arc_space_return(bonuslen, ARC_SPACE_BONUS);
|
|
db->db_state = DB_UNCACHED;
|
|
DTRACE_SET_STATE(db, "buffer cleared");
|
|
}
|
|
}
|
|
|
|
dbuf_clear_data(db);
|
|
|
|
if (multilist_link_active(&db->db_cache_link)) {
|
|
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
|
|
db->db_caching_status == DB_DBUF_METADATA_CACHE);
|
|
|
|
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
|
|
|
|
ASSERT0(dmu_buf_user_size(&db->db));
|
|
(void) zfs_refcount_remove_many(
|
|
&dbuf_caches[db->db_caching_status].size,
|
|
db->db.db_size, db);
|
|
|
|
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
|
|
DBUF_STAT_BUMPDOWN(metadata_cache_count);
|
|
} else {
|
|
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
|
|
DBUF_STAT_BUMPDOWN(cache_count);
|
|
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
|
|
db->db.db_size);
|
|
}
|
|
db->db_caching_status = DB_NO_CACHE;
|
|
}
|
|
|
|
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
|
|
ASSERT(db->db_data_pending == NULL);
|
|
ASSERT(list_is_empty(&db->db_dirty_records));
|
|
|
|
db->db_state = DB_EVICTING;
|
|
DTRACE_SET_STATE(db, "buffer eviction started");
|
|
db->db_blkptr = NULL;
|
|
|
|
/*
|
|
* Now that db_state is DB_EVICTING, nobody else can find this via
|
|
* the hash table. We can now drop db_mtx, which allows us to
|
|
* acquire the dn_dbufs_mtx.
|
|
*/
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
DB_DNODE_ENTER(db);
|
|
dn = DB_DNODE(db);
|
|
dndb = dn->dn_dbuf;
|
|
if (db->db_blkid != DMU_BONUS_BLKID) {
|
|
boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
|
|
if (needlock)
|
|
mutex_enter_nested(&dn->dn_dbufs_mtx,
|
|
NESTED_SINGLE);
|
|
avl_remove(&dn->dn_dbufs, db);
|
|
membar_producer();
|
|
DB_DNODE_EXIT(db);
|
|
if (needlock)
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
/*
|
|
* Decrementing the dbuf count means that the hold corresponding
|
|
* to the removed dbuf is no longer discounted in dnode_move(),
|
|
* so the dnode cannot be moved until after we release the hold.
|
|
* The membar_producer() ensures visibility of the decremented
|
|
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
|
|
* release any lock.
|
|
*/
|
|
mutex_enter(&dn->dn_mtx);
|
|
dnode_rele_and_unlock(dn, db, B_TRUE);
|
|
#ifdef USE_DNODE_HANDLE
|
|
db->db_dnode_handle = NULL;
|
|
#else
|
|
db->db_dnode = NULL;
|
|
#endif
|
|
|
|
dbuf_hash_remove(db);
|
|
} else {
|
|
DB_DNODE_EXIT(db);
|
|
}
|
|
|
|
ASSERT(zfs_refcount_is_zero(&db->db_holds));
|
|
|
|
db->db_parent = NULL;
|
|
|
|
ASSERT(db->db_buf == NULL);
|
|
ASSERT(db->db.db_data == NULL);
|
|
ASSERT(db->db_hash_next == NULL);
|
|
ASSERT(db->db_blkptr == NULL);
|
|
ASSERT(db->db_data_pending == NULL);
|
|
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
|
|
ASSERT(!multilist_link_active(&db->db_cache_link));
|
|
|
|
/*
|
|
* If this dbuf is referenced from an indirect dbuf,
|
|
* decrement the ref count on the indirect dbuf.
|
|
*/
|
|
if (parent && parent != dndb) {
|
|
mutex_enter(&parent->db_mtx);
|
|
dbuf_rele_and_unlock(parent, db, B_TRUE);
|
|
}
|
|
|
|
kmem_cache_free(dbuf_kmem_cache, db);
|
|
arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
|
|
}
|
|
|
|
/*
|
|
* Note: While bpp will always be updated if the function returns success,
|
|
* parentp will not be updated if the dnode does not have dn_dbuf filled in;
|
|
* this happens when the dnode is the meta-dnode, or {user|group|project}used
|
|
* object.
|
|
*/
|
|
__attribute__((always_inline))
|
|
static inline int
|
|
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
|
|
dmu_buf_impl_t **parentp, blkptr_t **bpp)
|
|
{
|
|
*parentp = NULL;
|
|
*bpp = NULL;
|
|
|
|
ASSERT(blkid != DMU_BONUS_BLKID);
|
|
|
|
if (blkid == DMU_SPILL_BLKID) {
|
|
mutex_enter(&dn->dn_mtx);
|
|
if (dn->dn_have_spill &&
|
|
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
|
|
*bpp = DN_SPILL_BLKPTR(dn->dn_phys);
|
|
else
|
|
*bpp = NULL;
|
|
dbuf_add_ref(dn->dn_dbuf, NULL);
|
|
*parentp = dn->dn_dbuf;
|
|
mutex_exit(&dn->dn_mtx);
|
|
return (0);
|
|
}
|
|
|
|
int nlevels =
|
|
(dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
|
|
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
ASSERT3U(level * epbs, <, 64);
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
/*
|
|
* This assertion shouldn't trip as long as the max indirect block size
|
|
* is less than 1M. The reason for this is that up to that point,
|
|
* the number of levels required to address an entire object with blocks
|
|
* of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
|
|
* other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
|
|
* (i.e. we can address the entire object), objects will all use at most
|
|
* N-1 levels and the assertion won't overflow. However, once epbs is
|
|
* 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
|
|
* enough to address an entire object, so objects will have 5 levels,
|
|
* but then this assertion will overflow.
|
|
*
|
|
* All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
|
|
* need to redo this logic to handle overflows.
|
|
*/
|
|
ASSERT(level >= nlevels ||
|
|
((nlevels - level - 1) * epbs) +
|
|
highbit64(dn->dn_phys->dn_nblkptr) <= 64);
|
|
if (level >= nlevels ||
|
|
blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
|
|
((nlevels - level - 1) * epbs)) ||
|
|
(fail_sparse &&
|
|
blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
|
|
/* the buffer has no parent yet */
|
|
return (SET_ERROR(ENOENT));
|
|
} else if (level < nlevels-1) {
|
|
/* this block is referenced from an indirect block */
|
|
int err;
|
|
|
|
err = dbuf_hold_impl(dn, level + 1,
|
|
blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
|
|
|
|
if (err)
|
|
return (err);
|
|
err = dbuf_read(*parentp, NULL,
|
|
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
|
|
if (err) {
|
|
dbuf_rele(*parentp, NULL);
|
|
*parentp = NULL;
|
|
return (err);
|
|
}
|
|
rw_enter(&(*parentp)->db_rwlock, RW_READER);
|
|
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
|
|
(blkid & ((1ULL << epbs) - 1));
|
|
if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
|
|
ASSERT(BP_IS_HOLE(*bpp));
|
|
rw_exit(&(*parentp)->db_rwlock);
|
|
return (0);
|
|
} else {
|
|
/* the block is referenced from the dnode */
|
|
ASSERT3U(level, ==, nlevels-1);
|
|
ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
|
|
blkid < dn->dn_phys->dn_nblkptr);
|
|
if (dn->dn_dbuf) {
|
|
dbuf_add_ref(dn->dn_dbuf, NULL);
|
|
*parentp = dn->dn_dbuf;
|
|
}
|
|
*bpp = &dn->dn_phys->dn_blkptr[blkid];
|
|
return (0);
|
|
}
|
|
}
|
|
|
|
static dmu_buf_impl_t *
|
|
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
|
|
dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
|
|
{
|
|
objset_t *os = dn->dn_objset;
|
|
dmu_buf_impl_t *db, *odb;
|
|
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
ASSERT(dn->dn_type != DMU_OT_NONE);
|
|
|
|
db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
|
|
|
|
list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
|
|
offsetof(dbuf_dirty_record_t, dr_dbuf_node));
|
|
|
|
db->db_objset = os;
|
|
db->db.db_object = dn->dn_object;
|
|
db->db_level = level;
|
|
db->db_blkid = blkid;
|
|
db->db_dirtycnt = 0;
|
|
#ifdef USE_DNODE_HANDLE
|
|
db->db_dnode_handle = dn->dn_handle;
|
|
#else
|
|
db->db_dnode = dn;
|
|
#endif
|
|
db->db_parent = parent;
|
|
db->db_blkptr = blkptr;
|
|
db->db_hash = hash;
|
|
|
|
db->db_user = NULL;
|
|
db->db_user_immediate_evict = FALSE;
|
|
db->db_freed_in_flight = FALSE;
|
|
db->db_pending_evict = FALSE;
|
|
|
|
if (blkid == DMU_BONUS_BLKID) {
|
|
ASSERT3P(parent, ==, dn->dn_dbuf);
|
|
db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
|
|
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
|
|
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
|
|
db->db.db_offset = DMU_BONUS_BLKID;
|
|
db->db_state = DB_UNCACHED;
|
|
DTRACE_SET_STATE(db, "bonus buffer created");
|
|
db->db_caching_status = DB_NO_CACHE;
|
|
/* the bonus dbuf is not placed in the hash table */
|
|
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
|
|
return (db);
|
|
} else if (blkid == DMU_SPILL_BLKID) {
|
|
db->db.db_size = (blkptr != NULL) ?
|
|
BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
|
|
db->db.db_offset = 0;
|
|
} else {
|
|
int blocksize =
|
|
db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
|
|
db->db.db_size = blocksize;
|
|
db->db.db_offset = db->db_blkid * blocksize;
|
|
}
|
|
|
|
/*
|
|
* Hold the dn_dbufs_mtx while we get the new dbuf
|
|
* in the hash table *and* added to the dbufs list.
|
|
* This prevents a possible deadlock with someone
|
|
* trying to look up this dbuf before it's added to the
|
|
* dn_dbufs list.
|
|
*/
|
|
mutex_enter(&dn->dn_dbufs_mtx);
|
|
db->db_state = DB_EVICTING; /* not worth logging this state change */
|
|
if ((odb = dbuf_hash_insert(db)) != NULL) {
|
|
/* someone else inserted it first */
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
kmem_cache_free(dbuf_kmem_cache, db);
|
|
DBUF_STAT_BUMP(hash_insert_race);
|
|
return (odb);
|
|
}
|
|
avl_add(&dn->dn_dbufs, db);
|
|
|
|
db->db_state = DB_UNCACHED;
|
|
DTRACE_SET_STATE(db, "regular buffer created");
|
|
db->db_caching_status = DB_NO_CACHE;
|
|
mutex_exit(&dn->dn_dbufs_mtx);
|
|
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
|
|
|
|
if (parent && parent != dn->dn_dbuf)
|
|
dbuf_add_ref(parent, db);
|
|
|
|
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
|
|
zfs_refcount_count(&dn->dn_holds) > 0);
|
|
(void) zfs_refcount_add(&dn->dn_holds, db);
|
|
|
|
dprintf_dbuf(db, "db=%p\n", db);
|
|
|
|
return (db);
|
|
}
|
|
|
|
/*
|
|
* This function returns a block pointer and information about the object,
|
|
* given a dnode and a block. This is a publicly accessible version of
|
|
* dbuf_findbp that only returns some information, rather than the
|
|
* dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock
|
|
* should be locked as (at least) a reader.
|
|
*/
|
|
int
|
|
dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
|
|
blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
|
|
{
|
|
dmu_buf_impl_t *dbp = NULL;
|
|
blkptr_t *bp2;
|
|
int err = 0;
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
|
|
if (err == 0) {
|
|
ASSERT3P(bp2, !=, NULL);
|
|
*bp = *bp2;
|
|
if (dbp != NULL)
|
|
dbuf_rele(dbp, NULL);
|
|
if (datablkszsec != NULL)
|
|
*datablkszsec = dn->dn_phys->dn_datablkszsec;
|
|
if (indblkshift != NULL)
|
|
*indblkshift = dn->dn_phys->dn_indblkshift;
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
typedef struct dbuf_prefetch_arg {
|
|
spa_t *dpa_spa; /* The spa to issue the prefetch in. */
|
|
zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
|
|
int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
|
|
int dpa_curlevel; /* The current level that we're reading */
|
|
dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
|
|
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
|
|
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
|
|
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
|
|
dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
|
|
void *dpa_arg; /* prefetch completion arg */
|
|
} dbuf_prefetch_arg_t;
|
|
|
|
static void
|
|
dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
|
|
{
|
|
if (dpa->dpa_cb != NULL) {
|
|
dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,
|
|
dpa->dpa_zb.zb_blkid, io_done);
|
|
}
|
|
kmem_free(dpa, sizeof (*dpa));
|
|
}
|
|
|
|
static void
|
|
dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
|
|
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
|
|
{
|
|
(void) zio, (void) zb, (void) iobp;
|
|
dbuf_prefetch_arg_t *dpa = private;
|
|
|
|
if (abuf != NULL)
|
|
arc_buf_destroy(abuf, private);
|
|
|
|
dbuf_prefetch_fini(dpa, B_TRUE);
|
|
}
|
|
|
|
/*
|
|
* Actually issue the prefetch read for the block given.
|
|
*/
|
|
static void
|
|
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
|
|
{
|
|
ASSERT(!BP_IS_REDACTED(bp) ||
|
|
dsl_dataset_feature_is_active(
|
|
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
|
|
SPA_FEATURE_REDACTED_DATASETS));
|
|
|
|
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
|
|
return (dbuf_prefetch_fini(dpa, B_FALSE));
|
|
|
|
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
|
|
arc_flags_t aflags =
|
|
dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
|
|
ARC_FLAG_NO_BUF;
|
|
|
|
/* dnodes are always read as raw and then converted later */
|
|
if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
|
|
dpa->dpa_curlevel == 0)
|
|
zio_flags |= ZIO_FLAG_RAW;
|
|
|
|
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
|
|
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
|
|
ASSERT(dpa->dpa_zio != NULL);
|
|
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
|
|
dbuf_issue_final_prefetch_done, dpa,
|
|
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
|
|
}
|
|
|
|
/*
|
|
* Called when an indirect block above our prefetch target is read in. This
|
|
* will either read in the next indirect block down the tree or issue the actual
|
|
* prefetch if the next block down is our target.
|
|
*/
|
|
static void
|
|
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
|
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
|
|
{
|
|
(void) zb, (void) iobp;
|
|
dbuf_prefetch_arg_t *dpa = private;
|
|
|
|
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
|
|
ASSERT3S(dpa->dpa_curlevel, >, 0);
|
|
|
|
if (abuf == NULL) {
|
|
ASSERT(zio == NULL || zio->io_error != 0);
|
|
dbuf_prefetch_fini(dpa, B_TRUE);
|
|
return;
|
|
}
|
|
ASSERT(zio == NULL || zio->io_error == 0);
|
|
|
|
/*
|
|
* The dpa_dnode is only valid if we are called with a NULL
|
|
* zio. This indicates that the arc_read() returned without
|
|
* first calling zio_read() to issue a physical read. Once
|
|
* a physical read is made the dpa_dnode must be invalidated
|
|
* as the locks guarding it may have been dropped. If the
|
|
* dpa_dnode is still valid, then we want to add it to the dbuf
|
|
* cache. To do so, we must hold the dbuf associated with the block
|
|
* we just prefetched, read its contents so that we associate it
|
|
* with an arc_buf_t, and then release it.
|
|
*/
|
|
if (zio != NULL) {
|
|
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
|
|
if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
|
|
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
|
|
} else {
|
|
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
|
|
}
|
|
ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
|
|
|
|
dpa->dpa_dnode = NULL;
|
|
} else if (dpa->dpa_dnode != NULL) {
|
|
uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
|
|
(dpa->dpa_epbs * (dpa->dpa_curlevel -
|
|
dpa->dpa_zb.zb_level));
|
|
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
|
|
dpa->dpa_curlevel, curblkid, FTAG);
|
|
if (db == NULL) {
|
|
arc_buf_destroy(abuf, private);
|
|
dbuf_prefetch_fini(dpa, B_TRUE);
|
|
return;
|
|
}
|
|
(void) dbuf_read(db, NULL,
|
|
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
|
|
dbuf_rele(db, FTAG);
|
|
}
|
|
|
|
dpa->dpa_curlevel--;
|
|
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
|
|
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
|
|
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
|
|
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
|
|
|
|
ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
|
|
dsl_dataset_feature_is_active(
|
|
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
|
|
SPA_FEATURE_REDACTED_DATASETS)));
|
|
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
|
|
arc_buf_destroy(abuf, private);
|
|
dbuf_prefetch_fini(dpa, B_TRUE);
|
|
return;
|
|
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
|
|
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
|
|
dbuf_issue_final_prefetch(dpa, bp);
|
|
} else {
|
|
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
|
zbookmark_phys_t zb;
|
|
|
|
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
|
|
if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
|
|
iter_aflags |= ARC_FLAG_L2CACHE;
|
|
|
|
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
|
|
|
|
SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
|
|
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
|
|
|
|
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
|
|
bp, dbuf_prefetch_indirect_done, dpa,
|
|
ZIO_PRIORITY_SYNC_READ,
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
&iter_aflags, &zb);
|
|
}
|
|
|
|
arc_buf_destroy(abuf, private);
|
|
}
|
|
|
|
/*
|
|
* Issue prefetch reads for the given block on the given level. If the indirect
|
|
* blocks above that block are not in memory, we will read them in
|
|
* asynchronously. As a result, this call never blocks waiting for a read to
|
|
* complete. Note that the prefetch might fail if the dataset is encrypted and
|
|
* the encryption key is unmapped before the IO completes.
|
|
*/
|
|
int
|
|
dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
|
|
zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
|
|
void *arg)
|
|
{
|
|
blkptr_t bp;
|
|
int epbs, nlevels, curlevel;
|
|
uint64_t curblkid;
|
|
|
|
ASSERT(blkid != DMU_BONUS_BLKID);
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
|
|
if (blkid > dn->dn_maxblkid)
|
|
goto no_issue;
|
|
|
|
if (level == 0 && dnode_block_freed(dn, blkid))
|
|
goto no_issue;
|
|
|
|
/*
|
|
* This dnode hasn't been written to disk yet, so there's nothing to
|
|
* prefetch.
|
|
*/
|
|
nlevels = dn->dn_phys->dn_nlevels;
|
|
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
|
|
goto no_issue;
|
|
|
|
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
|
|
goto no_issue;
|
|
|
|
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
|
|
level, blkid, NULL);
|
|
if (db != NULL) {
|
|
mutex_exit(&db->db_mtx);
|
|
/*
|
|
* This dbuf already exists. It is either CACHED, or
|
|
* (we assume) about to be read or filled.
|
|
*/
|
|
goto no_issue;
|
|
}
|
|
|
|
/*
|
|
* Find the closest ancestor (indirect block) of the target block
|
|
* that is present in the cache. In this indirect block, we will
|
|
* find the bp that is at curlevel, curblkid.
|
|
*/
|
|
curlevel = level;
|
|
curblkid = blkid;
|
|
while (curlevel < nlevels - 1) {
|
|
int parent_level = curlevel + 1;
|
|
uint64_t parent_blkid = curblkid >> epbs;
|
|
dmu_buf_impl_t *db;
|
|
|
|
if (dbuf_hold_impl(dn, parent_level, parent_blkid,
|
|
FALSE, TRUE, FTAG, &db) == 0) {
|
|
blkptr_t *bpp = db->db_buf->b_data;
|
|
bp = bpp[P2PHASE(curblkid, 1 << epbs)];
|
|
dbuf_rele(db, FTAG);
|
|
break;
|
|
}
|
|
|
|
curlevel = parent_level;
|
|
curblkid = parent_blkid;
|
|
}
|
|
|
|
if (curlevel == nlevels - 1) {
|
|
/* No cached indirect blocks found. */
|
|
ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
|
|
bp = dn->dn_phys->dn_blkptr[curblkid];
|
|
}
|
|
ASSERT(!BP_IS_REDACTED(&bp) ||
|
|
dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
|
|
SPA_FEATURE_REDACTED_DATASETS));
|
|
if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
|
|
goto no_issue;
|
|
|
|
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
|
|
|
|
zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
|
|
ZIO_FLAG_CANFAIL);
|
|
|
|
dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
|
|
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
|
|
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
|
|
dn->dn_object, level, blkid);
|
|
dpa->dpa_curlevel = curlevel;
|
|
dpa->dpa_prio = prio;
|
|
dpa->dpa_aflags = aflags;
|
|
dpa->dpa_spa = dn->dn_objset->os_spa;
|
|
dpa->dpa_dnode = dn;
|
|
dpa->dpa_epbs = epbs;
|
|
dpa->dpa_zio = pio;
|
|
dpa->dpa_cb = cb;
|
|
dpa->dpa_arg = arg;
|
|
|
|
if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
|
|
dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
|
|
else if (dnode_level_is_l2cacheable(&bp, dn, level))
|
|
dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
|
|
|
|
/*
|
|
* If we have the indirect just above us, no need to do the asynchronous
|
|
* prefetch chain; we'll just run the last step ourselves. If we're at
|
|
* a higher level, though, we want to issue the prefetches for all the
|
|
* indirect blocks asynchronously, so we can go on with whatever we were
|
|
* doing.
|
|
*/
|
|
if (curlevel == level) {
|
|
ASSERT3U(curblkid, ==, blkid);
|
|
dbuf_issue_final_prefetch(dpa, &bp);
|
|
} else {
|
|
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
|
|
zbookmark_phys_t zb;
|
|
|
|
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
|
|
if (dnode_level_is_l2cacheable(&bp, dn, level))
|
|
iter_aflags |= ARC_FLAG_L2CACHE;
|
|
|
|
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
|
|
dn->dn_object, curlevel, curblkid);
|
|
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
|
|
&bp, dbuf_prefetch_indirect_done, dpa,
|
|
ZIO_PRIORITY_SYNC_READ,
|
|
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
|
|
&iter_aflags, &zb);
|
|
}
|
|
/*
|
|
* We use pio here instead of dpa_zio since it's possible that
|
|
* dpa may have already been freed.
|
|
*/
|
|
zio_nowait(pio);
|
|
return (1);
|
|
no_issue:
|
|
if (cb != NULL)
|
|
cb(arg, level, blkid, B_FALSE);
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
|
|
arc_flags_t aflags)
|
|
{
|
|
|
|
return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
|
|
}
|
|
|
|
/*
|
|
* Helper function for dbuf_hold_impl() to copy a buffer. Handles
|
|
* the case of encrypted, compressed and uncompressed buffers by
|
|
* allocating the new buffer, respectively, with arc_alloc_raw_buf(),
|
|
* arc_alloc_compressed_buf() or arc_alloc_buf().*
|
|
*
|
|
* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
|
|
*/
|
|
noinline static void
|
|
dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
|
|
{
|
|
dbuf_dirty_record_t *dr = db->db_data_pending;
|
|
arc_buf_t *data = dr->dt.dl.dr_data;
|
|
enum zio_compress compress_type = arc_get_compression(data);
|
|
uint8_t complevel = arc_get_complevel(data);
|
|
|
|
if (arc_is_encrypted(data)) {
|
|
boolean_t byteorder;
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
uint8_t mac[ZIO_DATA_MAC_LEN];
|
|
|
|
arc_get_raw_params(data, &byteorder, salt, iv, mac);
|
|
dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
|
|
dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
|
|
dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
|
|
compress_type, complevel));
|
|
} else if (compress_type != ZIO_COMPRESS_OFF) {
|
|
dbuf_set_data(db, arc_alloc_compressed_buf(
|
|
dn->dn_objset->os_spa, db, arc_buf_size(data),
|
|
arc_buf_lsize(data), compress_type, complevel));
|
|
} else {
|
|
dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
|
|
DBUF_GET_BUFC_TYPE(db), db->db.db_size));
|
|
}
|
|
|
|
rw_enter(&db->db_rwlock, RW_WRITER);
|
|
memcpy(db->db.db_data, data->b_data, arc_buf_size(data));
|
|
rw_exit(&db->db_rwlock);
|
|
}
|
|
|
|
/*
|
|
* Returns with db_holds incremented, and db_mtx not held.
|
|
* Note: dn_struct_rwlock must be held.
|
|
*/
|
|
int
|
|
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
|
|
boolean_t fail_sparse, boolean_t fail_uncached,
|
|
const void *tag, dmu_buf_impl_t **dbp)
|
|
{
|
|
dmu_buf_impl_t *db, *parent = NULL;
|
|
uint64_t hv;
|
|
|
|
/* If the pool has been created, verify the tx_sync_lock is not held */
|
|
spa_t *spa = dn->dn_objset->os_spa;
|
|
dsl_pool_t *dp = spa->spa_dsl_pool;
|
|
if (dp != NULL) {
|
|
ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
|
|
}
|
|
|
|
ASSERT(blkid != DMU_BONUS_BLKID);
|
|
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
|
|
ASSERT3U(dn->dn_nlevels, >, level);
|
|
|
|
*dbp = NULL;
|
|
|
|
/* dbuf_find() returns with db_mtx held */
|
|
db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
|
|
|
|
if (db == NULL) {
|
|
blkptr_t *bp = NULL;
|
|
int err;
|
|
|
|
if (fail_uncached)
|
|
return (SET_ERROR(ENOENT));
|
|
|
|
ASSERT3P(parent, ==, NULL);
|
|
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
|
|
if (fail_sparse) {
|
|
if (err == 0 && bp && BP_IS_HOLE(bp))
|
|
err = SET_ERROR(ENOENT);
|
|
if (err) {
|
|
if (parent)
|
|
dbuf_rele(parent, NULL);
|
|
return (err);
|
|
}
|
|
}
|
|
if (err && err != ENOENT)
|
|
return (err);
|
|
db = dbuf_create(dn, level, blkid, parent, bp, hv);
|
|
}
|
|
|
|
if (fail_uncached && db->db_state != DB_CACHED) {
|
|
mutex_exit(&db->db_mtx);
|
|
return (SET_ERROR(ENOENT));
|
|
}
|
|
|
|
if (db->db_buf != NULL) {
|
|
arc_buf_access(db->db_buf);
|
|
ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
|
|
}
|
|
|
|
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
|
|
|
|
/*
|
|
* If this buffer is currently syncing out, and we are
|
|
* still referencing it from db_data, we need to make a copy
|
|
* of it in case we decide we want to dirty it again in this txg.
|
|
*/
|
|
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
|
dn->dn_object != DMU_META_DNODE_OBJECT &&
|
|
db->db_state == DB_CACHED && db->db_data_pending) {
|
|
dbuf_dirty_record_t *dr = db->db_data_pending;
|
|
if (dr->dt.dl.dr_data == db->db_buf) {
|
|
ASSERT3P(db->db_buf, !=, NULL);
|
|
dbuf_hold_copy(dn, db);
|
|
}
|
|
}
|
|
|
|
if (multilist_link_active(&db->db_cache_link)) {
|
|
ASSERT(zfs_refcount_is_zero(&db->db_holds));
|
|
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
|
|
db->db_caching_status == DB_DBUF_METADATA_CACHE);
|
|
|
|
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
|
|
|
|
uint64_t size = db->db.db_size;
|
|
uint64_t usize = dmu_buf_user_size(&db->db);
|
|
(void) zfs_refcount_remove_many(
|
|
&dbuf_caches[db->db_caching_status].size, size, db);
|
|
(void) zfs_refcount_remove_many(
|
|
&dbuf_caches[db->db_caching_status].size, usize,
|
|
db->db_user);
|
|
|
|
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
|
|
DBUF_STAT_BUMPDOWN(metadata_cache_count);
|
|
} else {
|
|
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
|
|
DBUF_STAT_BUMPDOWN(cache_count);
|
|
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
|
|
size + usize);
|
|
}
|
|
db->db_caching_status = DB_NO_CACHE;
|
|
}
|
|
(void) zfs_refcount_add(&db->db_holds, tag);
|
|
DBUF_VERIFY(db);
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
/* NOTE: we can't rele the parent until after we drop the db_mtx */
|
|
if (parent)
|
|
dbuf_rele(parent, NULL);
|
|
|
|
ASSERT3P(DB_DNODE(db), ==, dn);
|
|
ASSERT3U(db->db_blkid, ==, blkid);
|
|
ASSERT3U(db->db_level, ==, level);
|
|
*dbp = db;
|
|
|
|
return (0);
|
|
}
|
|
|
|
dmu_buf_impl_t *
|
|
dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
|
|
{
|
|
return (dbuf_hold_level(dn, 0, blkid, tag));
|
|
}
|
|
|
|
dmu_buf_impl_t *
|
|
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
|
|
{
|
|
dmu_buf_impl_t *db;
|
|
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
|
|
return (err ? NULL : db);
|
|
}
|
|
|
|
void
|
|
dbuf_create_bonus(dnode_t *dn)
|
|
{
|
|
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
|
|
|
|
ASSERT(dn->dn_bonus == NULL);
|
|
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
|
|
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
|
|
}
|
|
|
|
int
|
|
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
if (db->db_blkid != DMU_SPILL_BLKID)
|
|
return (SET_ERROR(ENOTSUP));
|
|
if (blksz == 0)
|
|
blksz = SPA_MINBLOCKSIZE;
|
|
ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
|
|
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
|
|
|
|
dbuf_new_size(db, blksz, tx);
|
|
|
|
return (0);
|
|
}
|
|
|
|
void
|
|
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
|
|
{
|
|
dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
|
|
}
|
|
|
|
#pragma weak dmu_buf_add_ref = dbuf_add_ref
|
|
void
|
|
dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
|
|
{
|
|
int64_t holds = zfs_refcount_add(&db->db_holds, tag);
|
|
VERIFY3S(holds, >, 1);
|
|
}
|
|
|
|
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
|
|
boolean_t
|
|
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
|
|
const void *tag)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
dmu_buf_impl_t *found_db;
|
|
boolean_t result = B_FALSE;
|
|
|
|
if (blkid == DMU_BONUS_BLKID)
|
|
found_db = dbuf_find_bonus(os, obj);
|
|
else
|
|
found_db = dbuf_find(os, obj, 0, blkid, NULL);
|
|
|
|
if (found_db != NULL) {
|
|
if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
|
|
(void) zfs_refcount_add(&db->db_holds, tag);
|
|
result = B_TRUE;
|
|
}
|
|
mutex_exit(&found_db->db_mtx);
|
|
}
|
|
return (result);
|
|
}
|
|
|
|
/*
|
|
* If you call dbuf_rele() you had better not be referencing the dnode handle
|
|
* unless you have some other direct or indirect hold on the dnode. (An indirect
|
|
* hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
|
|
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
|
|
* dnode's parent dbuf evicting its dnode handles.
|
|
*/
|
|
void
|
|
dbuf_rele(dmu_buf_impl_t *db, const void *tag)
|
|
{
|
|
mutex_enter(&db->db_mtx);
|
|
dbuf_rele_and_unlock(db, tag, B_FALSE);
|
|
}
|
|
|
|
void
|
|
dmu_buf_rele(dmu_buf_t *db, const void *tag)
|
|
{
|
|
dbuf_rele((dmu_buf_impl_t *)db, tag);
|
|
}
|
|
|
|
/*
|
|
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
|
|
* db_dirtycnt and db_holds to be updated atomically. The 'evicting'
|
|
* argument should be set if we are already in the dbuf-evicting code
|
|
* path, in which case we don't want to recursively evict. This allows us to
|
|
* avoid deeply nested stacks that would have a call flow similar to this:
|
|
*
|
|
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
|
|
* ^ |
|
|
* | |
|
|
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
|
|
*
|
|
*/
|
|
void
|
|
dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
|
|
{
|
|
int64_t holds;
|
|
uint64_t size;
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
DBUF_VERIFY(db);
|
|
|
|
/*
|
|
* Remove the reference to the dbuf before removing its hold on the
|
|
* dnode so we can guarantee in dnode_move() that a referenced bonus
|
|
* buffer has a corresponding dnode hold.
|
|
*/
|
|
holds = zfs_refcount_remove(&db->db_holds, tag);
|
|
ASSERT(holds >= 0);
|
|
|
|
/*
|
|
* We can't freeze indirects if there is a possibility that they
|
|
* may be modified in the current syncing context.
|
|
*/
|
|
if (db->db_buf != NULL &&
|
|
holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
|
|
arc_buf_freeze(db->db_buf);
|
|
}
|
|
|
|
if (holds == db->db_dirtycnt &&
|
|
db->db_level == 0 && db->db_user_immediate_evict)
|
|
dbuf_evict_user(db);
|
|
|
|
if (holds == 0) {
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
|
dnode_t *dn;
|
|
boolean_t evict_dbuf = db->db_pending_evict;
|
|
|
|
/*
|
|
* If the dnode moves here, we cannot cross this
|
|
* barrier until the move completes.
|
|
*/
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dn = DB_DNODE(db);
|
|
atomic_dec_32(&dn->dn_dbufs_count);
|
|
|
|
/*
|
|
* Decrementing the dbuf count means that the bonus
|
|
* buffer's dnode hold is no longer discounted in
|
|
* dnode_move(). The dnode cannot move until after
|
|
* the dnode_rele() below.
|
|
*/
|
|
DB_DNODE_EXIT(db);
|
|
|
|
/*
|
|
* Do not reference db after its lock is dropped.
|
|
* Another thread may evict it.
|
|
*/
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (evict_dbuf)
|
|
dnode_evict_bonus(dn);
|
|
|
|
dnode_rele(dn, db);
|
|
} else if (db->db_buf == NULL) {
|
|
/*
|
|
* This is a special case: we never associated this
|
|
* dbuf with any data allocated from the ARC.
|
|
*/
|
|
ASSERT(db->db_state == DB_UNCACHED ||
|
|
db->db_state == DB_NOFILL);
|
|
dbuf_destroy(db);
|
|
} else if (arc_released(db->db_buf)) {
|
|
/*
|
|
* This dbuf has anonymous data associated with it.
|
|
*/
|
|
dbuf_destroy(db);
|
|
} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
|
|
db->db_pending_evict) {
|
|
dbuf_destroy(db);
|
|
} else if (!multilist_link_active(&db->db_cache_link)) {
|
|
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
|
|
|
|
dbuf_cached_state_t dcs =
|
|
dbuf_include_in_metadata_cache(db) ?
|
|
DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
|
|
db->db_caching_status = dcs;
|
|
|
|
multilist_insert(&dbuf_caches[dcs].cache, db);
|
|
uint64_t db_size = db->db.db_size;
|
|
uint64_t dbu_size = dmu_buf_user_size(&db->db);
|
|
(void) zfs_refcount_add_many(
|
|
&dbuf_caches[dcs].size, db_size, db);
|
|
size = zfs_refcount_add_many(
|
|
&dbuf_caches[dcs].size, dbu_size, db->db_user);
|
|
uint8_t db_level = db->db_level;
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
if (dcs == DB_DBUF_METADATA_CACHE) {
|
|
DBUF_STAT_BUMP(metadata_cache_count);
|
|
DBUF_STAT_MAX(metadata_cache_size_bytes_max,
|
|
size);
|
|
} else {
|
|
DBUF_STAT_BUMP(cache_count);
|
|
DBUF_STAT_MAX(cache_size_bytes_max, size);
|
|
DBUF_STAT_BUMP(cache_levels[db_level]);
|
|
DBUF_STAT_INCR(cache_levels_bytes[db_level],
|
|
db_size + dbu_size);
|
|
}
|
|
|
|
if (dcs == DB_DBUF_CACHE && !evicting)
|
|
dbuf_evict_notify(size);
|
|
}
|
|
} else {
|
|
mutex_exit(&db->db_mtx);
|
|
}
|
|
}
|
|
|
|
#pragma weak dmu_buf_refcount = dbuf_refcount
|
|
uint64_t
|
|
dbuf_refcount(dmu_buf_impl_t *db)
|
|
{
|
|
return (zfs_refcount_count(&db->db_holds));
|
|
}
|
|
|
|
uint64_t
|
|
dmu_buf_user_refcount(dmu_buf_t *db_fake)
|
|
{
|
|
uint64_t holds;
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
|
|
holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return (holds);
|
|
}
|
|
|
|
void *
|
|
dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
|
|
dmu_buf_user_t *new_user)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
dbuf_verify_user(db, DBVU_NOT_EVICTING);
|
|
if (db->db_user == old_user)
|
|
db->db_user = new_user;
|
|
else
|
|
old_user = db->db_user;
|
|
dbuf_verify_user(db, DBVU_NOT_EVICTING);
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
return (old_user);
|
|
}
|
|
|
|
void *
|
|
dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
|
|
{
|
|
return (dmu_buf_replace_user(db_fake, NULL, user));
|
|
}
|
|
|
|
void *
|
|
dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
db->db_user_immediate_evict = TRUE;
|
|
return (dmu_buf_set_user(db_fake, user));
|
|
}
|
|
|
|
void *
|
|
dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
|
|
{
|
|
return (dmu_buf_replace_user(db_fake, user, NULL));
|
|
}
|
|
|
|
void *
|
|
dmu_buf_get_user(dmu_buf_t *db_fake)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
|
|
dbuf_verify_user(db, DBVU_NOT_EVICTING);
|
|
return (db->db_user);
|
|
}
|
|
|
|
uint64_t
|
|
dmu_buf_user_size(dmu_buf_t *db_fake)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
if (db->db_user == NULL)
|
|
return (0);
|
|
return (atomic_load_64(&db->db_user->dbu_size));
|
|
}
|
|
|
|
void
|
|
dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
|
|
ASSERT3P(db->db_user, !=, NULL);
|
|
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
|
|
atomic_add_64(&db->db_user->dbu_size, nadd);
|
|
}
|
|
|
|
void
|
|
dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
|
|
{
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
|
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
|
|
ASSERT3P(db->db_user, !=, NULL);
|
|
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
|
|
atomic_sub_64(&db->db_user->dbu_size, nsub);
|
|
}
|
|
|
|
void
|
|
dmu_buf_user_evict_wait(void)
|
|
{
|
|
taskq_wait(dbu_evict_taskq);
|
|
}
|
|
|
|
blkptr_t *
|
|
dmu_buf_get_blkptr(dmu_buf_t *db)
|
|
{
|
|
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
|
|
return (dbi->db_blkptr);
|
|
}
|
|
|
|
objset_t *
|
|
dmu_buf_get_objset(dmu_buf_t *db)
|
|
{
|
|
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
|
|
return (dbi->db_objset);
|
|
}
|
|
|
|
static void
|
|
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
|
|
{
|
|
/* ASSERT(dmu_tx_is_syncing(tx) */
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
|
|
if (db->db_blkptr != NULL)
|
|
return;
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
|
|
BP_ZERO(db->db_blkptr);
|
|
return;
|
|
}
|
|
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
|
|
/*
|
|
* This buffer was allocated at a time when there was
|
|
* no available blkptrs from the dnode, or it was
|
|
* inappropriate to hook it in (i.e., nlevels mismatch).
|
|
*/
|
|
ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
|
|
ASSERT(db->db_parent == NULL);
|
|
db->db_parent = dn->dn_dbuf;
|
|
db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
|
|
DBUF_VERIFY(db);
|
|
} else {
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
|
|
ASSERT(dn->dn_phys->dn_nlevels > 1);
|
|
if (parent == NULL) {
|
|
mutex_exit(&db->db_mtx);
|
|
rw_enter(&dn->dn_struct_rwlock, RW_READER);
|
|
parent = dbuf_hold_level(dn, db->db_level + 1,
|
|
db->db_blkid >> epbs, db);
|
|
rw_exit(&dn->dn_struct_rwlock);
|
|
mutex_enter(&db->db_mtx);
|
|
db->db_parent = parent;
|
|
}
|
|
db->db_blkptr = (blkptr_t *)parent->db.db_data +
|
|
(db->db_blkid & ((1ULL << epbs) - 1));
|
|
DBUF_VERIFY(db);
|
|
}
|
|
}
|
|
|
|
static void
|
|
dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
void *data = dr->dt.dl.dr_data;
|
|
|
|
ASSERT0(db->db_level);
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
ASSERT(db->db_blkid == DMU_BONUS_BLKID);
|
|
ASSERT(data != NULL);
|
|
|
|
dnode_t *dn = dr->dr_dnode;
|
|
ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
|
|
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
|
|
memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
|
|
|
|
dbuf_sync_leaf_verify_bonus_dnode(dr);
|
|
|
|
dbuf_undirty_bonus(dr);
|
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
|
|
}
|
|
|
|
/*
|
|
* When syncing out a blocks of dnodes, adjust the block to deal with
|
|
* encryption. Normally, we make sure the block is decrypted before writing
|
|
* it. If we have crypt params, then we are writing a raw (encrypted) block,
|
|
* from a raw receive. In this case, set the ARC buf's crypt params so
|
|
* that the BP will be filled with the correct byteorder, salt, iv, and mac.
|
|
*/
|
|
static void
|
|
dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
|
|
{
|
|
int err;
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
|
ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
|
|
ASSERT3U(db->db_level, ==, 0);
|
|
|
|
if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
|
|
zbookmark_phys_t zb;
|
|
|
|
/*
|
|
* Unfortunately, there is currently no mechanism for
|
|
* syncing context to handle decryption errors. An error
|
|
* here is only possible if an attacker maliciously
|
|
* changed a dnode block and updated the associated
|
|
* checksums going up the block tree.
|
|
*/
|
|
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
|
err = arc_untransform(db->db_buf, db->db_objset->os_spa,
|
|
&zb, B_TRUE);
|
|
if (err)
|
|
panic("Invalid dnode block MAC");
|
|
} else if (dr->dt.dl.dr_has_raw_params) {
|
|
(void) arc_release(dr->dt.dl.dr_data, db);
|
|
arc_convert_to_raw(dr->dt.dl.dr_data,
|
|
dmu_objset_id(db->db_objset),
|
|
dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
|
|
dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
|
|
* is critical the we not allow the compiler to inline this function in to
|
|
* dbuf_sync_list() thereby drastically bloating the stack usage.
|
|
*/
|
|
noinline static void
|
|
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
dnode_t *dn = dr->dr_dnode;
|
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
ASSERT(db->db_level > 0);
|
|
DBUF_VERIFY(db);
|
|
|
|
/* Read the block if it hasn't been read yet. */
|
|
if (db->db_buf == NULL) {
|
|
mutex_exit(&db->db_mtx);
|
|
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
|
|
mutex_enter(&db->db_mtx);
|
|
}
|
|
ASSERT3U(db->db_state, ==, DB_CACHED);
|
|
ASSERT(db->db_buf != NULL);
|
|
|
|
/* Indirect block size must match what the dnode thinks it is. */
|
|
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
|
|
dbuf_check_blkptr(dn, db);
|
|
|
|
/* Provide the pending dirty record to child dbufs */
|
|
db->db_data_pending = dr;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_write(dr, db->db_buf, tx);
|
|
|
|
zio_t *zio = dr->dr_zio;
|
|
mutex_enter(&dr->dt.di.dr_mtx);
|
|
dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
|
|
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
|
|
mutex_exit(&dr->dt.di.dr_mtx);
|
|
zio_nowait(zio);
|
|
}
|
|
|
|
/*
|
|
* Verify that the size of the data in our bonus buffer does not exceed
|
|
* its recorded size.
|
|
*
|
|
* The purpose of this verification is to catch any cases in development
|
|
* where the size of a phys structure (i.e space_map_phys_t) grows and,
|
|
* due to incorrect feature management, older pools expect to read more
|
|
* data even though they didn't actually write it to begin with.
|
|
*
|
|
* For a example, this would catch an error in the feature logic where we
|
|
* open an older pool and we expect to write the space map histogram of
|
|
* a space map with size SPACE_MAP_SIZE_V0.
|
|
*/
|
|
static void
|
|
dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
|
|
{
|
|
#ifdef ZFS_DEBUG
|
|
dnode_t *dn = dr->dr_dnode;
|
|
|
|
/*
|
|
* Encrypted bonus buffers can have data past their bonuslen.
|
|
* Skip the verification of these blocks.
|
|
*/
|
|
if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
|
|
return;
|
|
|
|
uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
|
|
uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
|
ASSERT3U(bonuslen, <=, maxbonuslen);
|
|
|
|
arc_buf_t *datap = dr->dt.dl.dr_data;
|
|
char *datap_end = ((char *)datap) + bonuslen;
|
|
char *datap_max = ((char *)datap) + maxbonuslen;
|
|
|
|
/* ensure that everything is zero after our data */
|
|
for (; datap_end < datap_max; datap_end++)
|
|
ASSERT(*datap_end == 0);
|
|
#endif
|
|
}
|
|
|
|
static blkptr_t *
|
|
dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
|
|
{
|
|
/* This must be a lightweight dirty record. */
|
|
ASSERT3P(dr->dr_dbuf, ==, NULL);
|
|
dnode_t *dn = dr->dr_dnode;
|
|
|
|
if (dn->dn_phys->dn_nlevels == 1) {
|
|
VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
|
|
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
|
|
} else {
|
|
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
|
|
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
VERIFY3U(parent_db->db_level, ==, 1);
|
|
VERIFY3P(DB_DNODE(parent_db), ==, dn);
|
|
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
|
|
blkptr_t *bp = parent_db->db.db_data;
|
|
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
|
|
}
|
|
}
|
|
|
|
static void
|
|
dbuf_lightweight_ready(zio_t *zio)
|
|
{
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
blkptr_t *bp = zio->io_bp;
|
|
|
|
if (zio->io_error != 0)
|
|
return;
|
|
|
|
dnode_t *dn = dr->dr_dnode;
|
|
|
|
blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
|
|
spa_t *spa = dmu_objset_spa(dn->dn_objset);
|
|
int64_t delta = bp_get_dsize_sync(spa, bp) -
|
|
bp_get_dsize_sync(spa, bp_orig);
|
|
dnode_diduse_space(dn, delta);
|
|
|
|
uint64_t blkid = dr->dt.dll.dr_blkid;
|
|
mutex_enter(&dn->dn_mtx);
|
|
if (blkid > dn->dn_phys->dn_maxblkid) {
|
|
ASSERT0(dn->dn_objset->os_raw_receive);
|
|
dn->dn_phys->dn_maxblkid = blkid;
|
|
}
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
if (!BP_IS_EMBEDDED(bp)) {
|
|
uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
|
|
BP_SET_FILL(bp, fill);
|
|
}
|
|
|
|
dmu_buf_impl_t *parent_db;
|
|
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
|
|
if (dr->dr_parent == NULL) {
|
|
parent_db = dn->dn_dbuf;
|
|
} else {
|
|
parent_db = dr->dr_parent->dr_dbuf;
|
|
}
|
|
rw_enter(&parent_db->db_rwlock, RW_WRITER);
|
|
*bp_orig = *bp;
|
|
rw_exit(&parent_db->db_rwlock);
|
|
}
|
|
|
|
static void
|
|
dbuf_lightweight_done(zio_t *zio)
|
|
{
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
|
|
VERIFY0(zio->io_error);
|
|
|
|
objset_t *os = dr->dr_dnode->dn_objset;
|
|
dmu_tx_t *tx = os->os_synctx;
|
|
|
|
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
|
|
ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
|
|
} else {
|
|
dsl_dataset_t *ds = os->os_dsl_dataset;
|
|
(void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
|
|
dsl_dataset_block_born(ds, zio->io_bp, tx);
|
|
}
|
|
|
|
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
|
|
zio->io_txg);
|
|
|
|
abd_free(dr->dt.dll.dr_abd);
|
|
kmem_free(dr, sizeof (*dr));
|
|
}
|
|
|
|
noinline static void
|
|
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
{
|
|
dnode_t *dn = dr->dr_dnode;
|
|
zio_t *pio;
|
|
if (dn->dn_phys->dn_nlevels == 1) {
|
|
pio = dn->dn_zio;
|
|
} else {
|
|
pio = dr->dr_parent->dr_zio;
|
|
}
|
|
|
|
zbookmark_phys_t zb = {
|
|
.zb_objset = dmu_objset_id(dn->dn_objset),
|
|
.zb_object = dn->dn_object,
|
|
.zb_level = 0,
|
|
.zb_blkid = dr->dt.dll.dr_blkid,
|
|
};
|
|
|
|
/*
|
|
* See comment in dbuf_write(). This is so that zio->io_bp_orig
|
|
* will have the old BP in dbuf_lightweight_done().
|
|
*/
|
|
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
|
|
|
|
dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
|
|
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
|
|
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
|
|
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
|
|
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
|
|
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
|
|
|
|
zio_nowait(dr->dr_zio);
|
|
}
|
|
|
|
/*
|
|
* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
|
|
* critical the we not allow the compiler to inline this function in to
|
|
* dbuf_sync_list() thereby drastically bloating the stack usage.
|
|
*/
|
|
noinline static void
|
|
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|
{
|
|
arc_buf_t **datap = &dr->dt.dl.dr_data;
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
dnode_t *dn = dr->dr_dnode;
|
|
objset_t *os;
|
|
uint64_t txg = tx->tx_txg;
|
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
/*
|
|
* To be synced, we must be dirtied. But we might have been freed
|
|
* after the dirty.
|
|
*/
|
|
if (db->db_state == DB_UNCACHED) {
|
|
/* This buffer has been freed since it was dirtied */
|
|
ASSERT3P(db->db.db_data, ==, NULL);
|
|
} else if (db->db_state == DB_FILL) {
|
|
/* This buffer was freed and is now being re-filled */
|
|
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
|
|
} else if (db->db_state == DB_READ) {
|
|
/*
|
|
* This buffer was either cloned or had a Direct I/O write
|
|
* occur and has an in-flgiht read on the BP. It is safe to
|
|
* issue the write here, because the read has already been
|
|
* issued and the contents won't change.
|
|
*
|
|
* We can verify the case of both the clone and Direct I/O
|
|
* write by making sure the first dirty record for the dbuf
|
|
* has no ARC buffer associated with it.
|
|
*/
|
|
dbuf_dirty_record_t *dr_head =
|
|
list_head(&db->db_dirty_records);
|
|
ASSERT3P(db->db_buf, ==, NULL);
|
|
ASSERT3P(db->db.db_data, ==, NULL);
|
|
ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL);
|
|
ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
|
|
} else {
|
|
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
|
|
}
|
|
DBUF_VERIFY(db);
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
mutex_enter(&dn->dn_mtx);
|
|
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
|
|
/*
|
|
* In the previous transaction group, the bonus buffer
|
|
* was entirely used to store the attributes for the
|
|
* dnode which overrode the dn_spill field. However,
|
|
* when adding more attributes to the file a spill
|
|
* block was required to hold the extra attributes.
|
|
*
|
|
* Make sure to clear the garbage left in the dn_spill
|
|
* field from the previous attributes in the bonus
|
|
* buffer. Otherwise, after writing out the spill
|
|
* block to the new allocated dva, it will free
|
|
* the old block pointed to by the invalid dn_spill.
|
|
*/
|
|
db->db_blkptr = NULL;
|
|
}
|
|
dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
|
|
mutex_exit(&dn->dn_mtx);
|
|
}
|
|
|
|
/*
|
|
* If this is a bonus buffer, simply copy the bonus data into the
|
|
* dnode. It will be written out when the dnode is synced (and it
|
|
* will be synced, since it must have been dirty for dbuf_sync to
|
|
* be called).
|
|
*/
|
|
if (db->db_blkid == DMU_BONUS_BLKID) {
|
|
ASSERT(dr->dr_dbuf == db);
|
|
dbuf_sync_bonus(dr, tx);
|
|
return;
|
|
}
|
|
|
|
os = dn->dn_objset;
|
|
|
|
/*
|
|
* This function may have dropped the db_mtx lock allowing a dmu_sync
|
|
* operation to sneak in. As a result, we need to ensure that we
|
|
* don't check the dr_override_state until we have returned from
|
|
* dbuf_check_blkptr.
|
|
*/
|
|
dbuf_check_blkptr(dn, db);
|
|
|
|
/*
|
|
* If this buffer is in the middle of an immediate write, wait for the
|
|
* synchronous IO to complete.
|
|
*
|
|
* This is also valid even with Direct I/O writes setting a dirty
|
|
* records override state into DR_IN_DMU_SYNC, because all
|
|
* Direct I/O writes happen in open-context.
|
|
*/
|
|
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
|
|
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
|
|
cv_wait(&db->db_changed, &db->db_mtx);
|
|
}
|
|
|
|
/*
|
|
* If this is a dnode block, ensure it is appropriately encrypted
|
|
* or decrypted, depending on what we are writing to it this txg.
|
|
*/
|
|
if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
|
|
dbuf_prepare_encrypted_dnode_leaf(dr);
|
|
|
|
if (*datap != NULL && *datap == db->db_buf &&
|
|
dn->dn_object != DMU_META_DNODE_OBJECT &&
|
|
zfs_refcount_count(&db->db_holds) > 1 &&
|
|
dr->dt.dl.dr_override_state != DR_OVERRIDDEN) {
|
|
/*
|
|
* If this buffer is currently "in use" (i.e., there
|
|
* are active holds and db_data still references it),
|
|
* then make a copy before we start the write so that
|
|
* any modifications from the open txg will not leak
|
|
* into this write.
|
|
*
|
|
* NOTE: this copy does not need to be made for
|
|
* objects only modified in the syncing context (e.g.
|
|
* DNONE_DNODE blocks).
|
|
*/
|
|
int psize = arc_buf_size(*datap);
|
|
int lsize = arc_buf_lsize(*datap);
|
|
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
|
|
enum zio_compress compress_type = arc_get_compression(*datap);
|
|
uint8_t complevel = arc_get_complevel(*datap);
|
|
|
|
if (arc_is_encrypted(*datap)) {
|
|
boolean_t byteorder;
|
|
uint8_t salt[ZIO_DATA_SALT_LEN];
|
|
uint8_t iv[ZIO_DATA_IV_LEN];
|
|
uint8_t mac[ZIO_DATA_MAC_LEN];
|
|
|
|
arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
|
|
*datap = arc_alloc_raw_buf(os->os_spa, db,
|
|
dmu_objset_id(os), byteorder, salt, iv, mac,
|
|
dn->dn_type, psize, lsize, compress_type,
|
|
complevel);
|
|
} else if (compress_type != ZIO_COMPRESS_OFF) {
|
|
ASSERT3U(type, ==, ARC_BUFC_DATA);
|
|
*datap = arc_alloc_compressed_buf(os->os_spa, db,
|
|
psize, lsize, compress_type, complevel);
|
|
} else {
|
|
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
|
|
}
|
|
memcpy((*datap)->b_data, db->db.db_data, psize);
|
|
}
|
|
db->db_data_pending = dr;
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_write(dr, *datap, tx);
|
|
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
if (dn->dn_object == DMU_META_DNODE_OBJECT) {
|
|
list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
|
|
} else {
|
|
zio_nowait(dr->dr_zio);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Syncs out a range of dirty records for indirect or leaf dbufs. May be
|
|
* called recursively from dbuf_sync_indirect().
|
|
*/
|
|
void
|
|
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
|
|
{
|
|
dbuf_dirty_record_t *dr;
|
|
|
|
while ((dr = list_head(list))) {
|
|
if (dr->dr_zio != NULL) {
|
|
/*
|
|
* If we find an already initialized zio then we
|
|
* are processing the meta-dnode, and we have finished.
|
|
* The dbufs for all dnodes are put back on the list
|
|
* during processing, so that we can zio_wait()
|
|
* these IOs after initiating all child IOs.
|
|
*/
|
|
ASSERT3U(dr->dr_dbuf->db.db_object, ==,
|
|
DMU_META_DNODE_OBJECT);
|
|
break;
|
|
}
|
|
list_remove(list, dr);
|
|
if (dr->dr_dbuf == NULL) {
|
|
dbuf_sync_lightweight(dr, tx);
|
|
} else {
|
|
if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
|
|
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
|
|
VERIFY3U(dr->dr_dbuf->db_level, ==, level);
|
|
}
|
|
if (dr->dr_dbuf->db_level > 0)
|
|
dbuf_sync_indirect(dr, tx);
|
|
else
|
|
dbuf_sync_leaf(dr, tx);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
{
|
|
(void) buf;
|
|
dmu_buf_impl_t *db = vdb;
|
|
dnode_t *dn;
|
|
blkptr_t *bp = zio->io_bp;
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
|
spa_t *spa = zio->io_spa;
|
|
int64_t delta;
|
|
uint64_t fill = 0;
|
|
int i;
|
|
|
|
ASSERT3P(db->db_blkptr, !=, NULL);
|
|
ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
|
|
|
|
DB_DNODE_ENTER(db);
|
|
dn = DB_DNODE(db);
|
|
delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
|
|
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
|
|
zio->io_prev_space_delta = delta;
|
|
|
|
if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
|
|
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
|
|
BP_GET_TYPE(bp) == dn->dn_type) ||
|
|
(db->db_blkid == DMU_SPILL_BLKID &&
|
|
BP_GET_TYPE(bp) == dn->dn_bonustype) ||
|
|
BP_IS_EMBEDDED(bp));
|
|
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
|
|
}
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
#ifdef ZFS_DEBUG
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
|
|
ASSERT(!(BP_IS_HOLE(bp)) &&
|
|
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
|
|
}
|
|
#endif
|
|
|
|
if (db->db_level == 0) {
|
|
mutex_enter(&dn->dn_mtx);
|
|
if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
|
|
db->db_blkid != DMU_SPILL_BLKID) {
|
|
ASSERT0(db->db_objset->os_raw_receive);
|
|
dn->dn_phys->dn_maxblkid = db->db_blkid;
|
|
}
|
|
mutex_exit(&dn->dn_mtx);
|
|
|
|
if (dn->dn_type == DMU_OT_DNODE) {
|
|
i = 0;
|
|
while (i < db->db.db_size) {
|
|
dnode_phys_t *dnp =
|
|
(void *)(((char *)db->db.db_data) + i);
|
|
|
|
i += DNODE_MIN_SIZE;
|
|
if (dnp->dn_type != DMU_OT_NONE) {
|
|
fill++;
|
|
for (int j = 0; j < dnp->dn_nblkptr;
|
|
j++) {
|
|
(void) zfs_blkptr_verify(spa,
|
|
&dnp->dn_blkptr[j],
|
|
BLK_CONFIG_SKIP,
|
|
BLK_VERIFY_HALT);
|
|
}
|
|
if (dnp->dn_flags &
|
|
DNODE_FLAG_SPILL_BLKPTR) {
|
|
(void) zfs_blkptr_verify(spa,
|
|
DN_SPILL_BLKPTR(dnp),
|
|
BLK_CONFIG_SKIP,
|
|
BLK_VERIFY_HALT);
|
|
}
|
|
i += dnp->dn_extra_slots *
|
|
DNODE_MIN_SIZE;
|
|
}
|
|
}
|
|
} else {
|
|
if (BP_IS_HOLE(bp)) {
|
|
fill = 0;
|
|
} else {
|
|
fill = 1;
|
|
}
|
|
}
|
|
} else {
|
|
blkptr_t *ibp = db->db.db_data;
|
|
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
|
|
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
|
|
if (BP_IS_HOLE(ibp))
|
|
continue;
|
|
(void) zfs_blkptr_verify(spa, ibp,
|
|
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
|
|
fill += BP_GET_FILL(ibp);
|
|
}
|
|
}
|
|
DB_DNODE_EXIT(db);
|
|
|
|
if (!BP_IS_EMBEDDED(bp))
|
|
BP_SET_FILL(bp, fill);
|
|
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
|
|
*db->db_blkptr = *bp;
|
|
dmu_buf_unlock_parent(db, dblt, FTAG);
|
|
}
|
|
|
|
/*
|
|
* This function gets called just prior to running through the compression
|
|
* stage of the zio pipeline. If we're an indirect block comprised of only
|
|
* holes, then we want this indirect to be compressed away to a hole. In
|
|
* order to do that we must zero out any information about the holes that
|
|
* this indirect points to prior to before we try to compress it.
|
|
*/
|
|
static void
|
|
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
{
|
|
(void) zio, (void) buf;
|
|
dmu_buf_impl_t *db = vdb;
|
|
blkptr_t *bp;
|
|
unsigned int epbs, i;
|
|
|
|
ASSERT3U(db->db_level, >, 0);
|
|
DB_DNODE_ENTER(db);
|
|
epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
DB_DNODE_EXIT(db);
|
|
ASSERT3U(epbs, <, 31);
|
|
|
|
/* Determine if all our children are holes */
|
|
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
|
|
if (!BP_IS_HOLE(bp))
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* If all the children are holes, then zero them all out so that
|
|
* we may get compressed away.
|
|
*/
|
|
if (i == 1ULL << epbs) {
|
|
/*
|
|
* We only found holes. Grab the rwlock to prevent
|
|
* anybody from reading the blocks we're about to
|
|
* zero out.
|
|
*/
|
|
rw_enter(&db->db_rwlock, RW_WRITER);
|
|
memset(db->db.db_data, 0, db->db.db_size);
|
|
rw_exit(&db->db_rwlock);
|
|
}
|
|
}
|
|
|
|
static void
|
|
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|
{
|
|
(void) buf;
|
|
dmu_buf_impl_t *db = vdb;
|
|
blkptr_t *bp_orig = &zio->io_bp_orig;
|
|
blkptr_t *bp = db->db_blkptr;
|
|
objset_t *os = db->db_objset;
|
|
dmu_tx_t *tx = os->os_synctx;
|
|
|
|
ASSERT0(zio->io_error);
|
|
ASSERT(db->db_blkptr == bp);
|
|
|
|
/*
|
|
* For nopwrites and rewrites we ensure that the bp matches our
|
|
* original and bypass all the accounting.
|
|
*/
|
|
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
|
|
ASSERT(BP_EQUAL(bp, bp_orig));
|
|
} else {
|
|
dsl_dataset_t *ds = os->os_dsl_dataset;
|
|
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
|
|
dsl_dataset_block_born(ds, bp, tx);
|
|
}
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
|
|
DBUF_VERIFY(db);
|
|
|
|
dbuf_dirty_record_t *dr = db->db_data_pending;
|
|
dnode_t *dn = dr->dr_dnode;
|
|
ASSERT(!list_link_active(&dr->dr_dirty_node));
|
|
ASSERT(dr->dr_dbuf == db);
|
|
ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
|
|
list_remove(&db->db_dirty_records, dr);
|
|
|
|
#ifdef ZFS_DEBUG
|
|
if (db->db_blkid == DMU_SPILL_BLKID) {
|
|
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
|
|
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
|
|
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
|
|
}
|
|
#endif
|
|
|
|
if (db->db_level == 0) {
|
|
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
|
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
|
|
|
|
/* no dr_data if this is a NO_FILL or Direct I/O */
|
|
if (dr->dt.dl.dr_data != NULL &&
|
|
dr->dt.dl.dr_data != db->db_buf) {
|
|
ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
|
|
ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
|
|
arc_buf_destroy(dr->dt.dl.dr_data, db);
|
|
}
|
|
} else {
|
|
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
|
|
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
|
|
if (!BP_IS_HOLE(db->db_blkptr)) {
|
|
int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
|
|
SPA_BLKPTRSHIFT;
|
|
ASSERT3U(db->db_blkid, <=,
|
|
dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
|
|
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
|
|
db->db.db_size);
|
|
}
|
|
mutex_destroy(&dr->dt.di.dr_mtx);
|
|
list_destroy(&dr->dt.di.dr_children);
|
|
}
|
|
|
|
cv_broadcast(&db->db_changed);
|
|
ASSERT(db->db_dirtycnt > 0);
|
|
db->db_dirtycnt -= 1;
|
|
db->db_data_pending = NULL;
|
|
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
|
|
|
|
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
|
|
zio->io_txg);
|
|
|
|
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
|
}
|
|
|
|
static void
|
|
dbuf_write_nofill_ready(zio_t *zio)
|
|
{
|
|
dbuf_write_ready(zio, NULL, zio->io_private);
|
|
}
|
|
|
|
static void
|
|
dbuf_write_nofill_done(zio_t *zio)
|
|
{
|
|
dbuf_write_done(zio, NULL, zio->io_private);
|
|
}
|
|
|
|
static void
|
|
dbuf_write_override_ready(zio_t *zio)
|
|
{
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
|
|
dbuf_write_ready(zio, NULL, db);
|
|
}
|
|
|
|
static void
|
|
dbuf_write_override_done(zio_t *zio)
|
|
{
|
|
dbuf_dirty_record_t *dr = zio->io_private;
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
|
|
|
|
mutex_enter(&db->db_mtx);
|
|
if (!BP_EQUAL(zio->io_bp, obp)) {
|
|
if (!BP_IS_HOLE(obp))
|
|
dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
|
|
arc_release(dr->dt.dl.dr_data, db);
|
|
}
|
|
mutex_exit(&db->db_mtx);
|
|
|
|
dbuf_write_done(zio, NULL, db);
|
|
|
|
if (zio->io_abd != NULL)
|
|
abd_free(zio->io_abd);
|
|
}
|
|
|
|
typedef struct dbuf_remap_impl_callback_arg {
|
|
objset_t *drica_os;
|
|
uint64_t drica_blk_birth;
|
|
dmu_tx_t *drica_tx;
|
|
} dbuf_remap_impl_callback_arg_t;
|
|
|
|
static void
|
|
dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
|
|
void *arg)
|
|
{
|
|
dbuf_remap_impl_callback_arg_t *drica = arg;
|
|
objset_t *os = drica->drica_os;
|
|
spa_t *spa = dmu_objset_spa(os);
|
|
dmu_tx_t *tx = drica->drica_tx;
|
|
|
|
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
|
|
|
|
if (os == spa_meta_objset(spa)) {
|
|
spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
|
|
} else {
|
|
dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
|
|
size, drica->drica_blk_birth, tx);
|
|
}
|
|
}
|
|
|
|
static void
|
|
dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
|
|
{
|
|
blkptr_t bp_copy = *bp;
|
|
spa_t *spa = dmu_objset_spa(dn->dn_objset);
|
|
dbuf_remap_impl_callback_arg_t drica;
|
|
|
|
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
|
|
|
|
drica.drica_os = dn->dn_objset;
|
|
drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
|
|
drica.drica_tx = tx;
|
|
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
|
|
&drica)) {
|
|
/*
|
|
* If the blkptr being remapped is tracked by a livelist,
|
|
* then we need to make sure the livelist reflects the update.
|
|
* First, cancel out the old blkptr by appending a 'FREE'
|
|
* entry. Next, add an 'ALLOC' to track the new version. This
|
|
* way we avoid trying to free an inaccurate blkptr at delete.
|
|
* Note that embedded blkptrs are not tracked in livelists.
|
|
*/
|
|
if (dn->dn_objset != spa_meta_objset(spa)) {
|
|
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
|
|
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
|
|
BP_GET_LOGICAL_BIRTH(bp) >
|
|
ds->ds_dir->dd_origin_txg) {
|
|
ASSERT(!BP_IS_EMBEDDED(bp));
|
|
ASSERT(dsl_dir_is_clone(ds->ds_dir));
|
|
ASSERT(spa_feature_is_enabled(spa,
|
|
SPA_FEATURE_LIVELIST));
|
|
bplist_append(&ds->ds_dir->dd_pending_frees,
|
|
bp);
|
|
bplist_append(&ds->ds_dir->dd_pending_allocs,
|
|
&bp_copy);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* The db_rwlock prevents dbuf_read_impl() from
|
|
* dereferencing the BP while we are changing it. To
|
|
* avoid lock contention, only grab it when we are actually
|
|
* changing the BP.
|
|
*/
|
|
if (rw != NULL)
|
|
rw_enter(rw, RW_WRITER);
|
|
*bp = bp_copy;
|
|
if (rw != NULL)
|
|
rw_exit(rw);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Remap any existing BP's to concrete vdevs, if possible.
|
|
*/
|
|
static void
|
|
dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|
{
|
|
spa_t *spa = dmu_objset_spa(db->db_objset);
|
|
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
|
|
|
|
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
|
|
return;
|
|
|
|
if (db->db_level > 0) {
|
|
blkptr_t *bp = db->db.db_data;
|
|
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
|
|
dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
|
|
}
|
|
} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
|
|
dnode_phys_t *dnp = db->db.db_data;
|
|
ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE);
|
|
for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
|
|
i += dnp[i].dn_extra_slots + 1) {
|
|
for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
|
|
krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
|
|
&dn->dn_dbuf->db_rwlock);
|
|
dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
|
|
tx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
|
|
* Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
|
|
*/
|
|
static void
|
|
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
|
{
|
|
dmu_buf_impl_t *db = dr->dr_dbuf;
|
|
dnode_t *dn = dr->dr_dnode;
|
|
objset_t *os;
|
|
dmu_buf_impl_t *parent = db->db_parent;
|
|
uint64_t txg = tx->tx_txg;
|
|
zbookmark_phys_t zb;
|
|
zio_prop_t zp;
|
|
zio_t *pio; /* parent I/O */
|
|
int wp_flag = 0;
|
|
|
|
ASSERT(dmu_tx_is_syncing(tx));
|
|
|
|
os = dn->dn_objset;
|
|
|
|
if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
|
|
/*
|
|
* Private object buffers are released here rather than in
|
|
* dbuf_dirty() since they are only modified in the syncing
|
|
* context and we don't want the overhead of making multiple
|
|
* copies of the data.
|
|
*/
|
|
if (BP_IS_HOLE(db->db_blkptr))
|
|
arc_buf_thaw(data);
|
|
else
|
|
dbuf_release_bp(db);
|
|
dbuf_remap(dn, db, tx);
|
|
}
|
|
|
|
if (parent != dn->dn_dbuf) {
|
|
/* Our parent is an indirect block. */
|
|
/* We have a dirty parent that has been scheduled for write. */
|
|
ASSERT(parent && parent->db_data_pending);
|
|
/* Our parent's buffer is one level closer to the dnode. */
|
|
ASSERT(db->db_level == parent->db_level-1);
|
|
/*
|
|
* We're about to modify our parent's db_data by modifying
|
|
* our block pointer, so the parent must be released.
|
|
*/
|
|
ASSERT(arc_released(parent->db_buf));
|
|
pio = parent->db_data_pending->dr_zio;
|
|
} else {
|
|
/* Our parent is the dnode itself. */
|
|
ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
|
|
db->db_blkid != DMU_SPILL_BLKID) ||
|
|
(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
|
|
if (db->db_blkid != DMU_SPILL_BLKID)
|
|
ASSERT3P(db->db_blkptr, ==,
|
|
&dn->dn_phys->dn_blkptr[db->db_blkid]);
|
|
pio = dn->dn_zio;
|
|
}
|
|
|
|
ASSERT(db->db_level == 0 || data == db->db_buf);
|
|
ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
|
|
ASSERT(pio);
|
|
|
|
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
|
|
os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
|
|
db->db.db_object, db->db_level, db->db_blkid);
|
|
|
|
if (db->db_blkid == DMU_SPILL_BLKID)
|
|
wp_flag = WP_SPILL;
|
|
wp_flag |= (data == NULL) ? WP_NOFILL : 0;
|
|
|
|
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
|
|
|
|
/*
|
|
* We copy the blkptr now (rather than when we instantiate the dirty
|
|
* record), because its value can change between open context and
|
|
* syncing context. We do not need to hold dn_struct_rwlock to read
|
|
* db_blkptr because we are in syncing context.
|
|
*/
|
|
dr->dr_bp_copy = *db->db_blkptr;
|
|
|
|
if (db->db_level == 0 &&
|
|
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
|
|
/*
|
|
* The BP for this block has been provided by open context
|
|
* (by dmu_sync(), dmu_write_direct(),
|
|
* or dmu_buf_write_embedded()).
|
|
*/
|
|
abd_t *contents = (data != NULL) ?
|
|
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
|
|
|
|
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
|
|
contents, db->db.db_size, db->db.db_size, &zp,
|
|
dbuf_write_override_ready, NULL,
|
|
dbuf_write_override_done,
|
|
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
|
|
mutex_enter(&db->db_mtx);
|
|
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
|
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
|
|
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
|
|
dr->dt.dl.dr_brtwrite);
|
|
mutex_exit(&db->db_mtx);
|
|
} else if (data == NULL) {
|
|
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
|
|
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
|
|
dr->dr_zio = zio_write(pio, os->os_spa, txg,
|
|
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
|
|
dbuf_write_nofill_ready, NULL,
|
|
dbuf_write_nofill_done, db,
|
|
ZIO_PRIORITY_ASYNC_WRITE,
|
|
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
|
|
} else {
|
|
ASSERT(arc_released(data));
|
|
|
|
/*
|
|
* For indirect blocks, we want to setup the children
|
|
* ready callback so that we can properly handle an indirect
|
|
* block that only contains holes.
|
|
*/
|
|
arc_write_done_func_t *children_ready_cb = NULL;
|
|
if (db->db_level != 0)
|
|
children_ready_cb = dbuf_write_children_ready;
|
|
|
|
dr->dr_zio = arc_write(pio, os->os_spa, txg,
|
|
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
|
|
dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,
|
|
children_ready_cb, dbuf_write_done, db,
|
|
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
|
|
}
|
|
}
|
|
|
|
/* Symbols exported from this file. */

/* dbuf lookup, lifecycle, and I/O */
EXPORT_SYMBOL(dbuf_find);
EXPORT_SYMBOL(dbuf_is_metadata);
EXPORT_SYMBOL(dbuf_destroy);
EXPORT_SYMBOL(dbuf_loan_arcbuf);
EXPORT_SYMBOL(dbuf_whichblock);
EXPORT_SYMBOL(dbuf_read);
EXPORT_SYMBOL(dbuf_unoverride);
EXPORT_SYMBOL(dbuf_free_range);
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);

/* dmu_buf dirtying and filling */
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
EXPORT_SYMBOL(dmu_buf_rele);

/* holds, references, and syncing */
EXPORT_SYMBOL(dbuf_assign_arcbuf);
EXPORT_SYMBOL(dbuf_prefetch);
EXPORT_SYMBOL(dbuf_hold_impl);
EXPORT_SYMBOL(dbuf_hold);
EXPORT_SYMBOL(dbuf_hold_level);
EXPORT_SYMBOL(dbuf_create_bonus);
EXPORT_SYMBOL(dbuf_spill_set_blksz);
EXPORT_SYMBOL(dbuf_rm_spill);
EXPORT_SYMBOL(dbuf_add_ref);
EXPORT_SYMBOL(dbuf_rele);
EXPORT_SYMBOL(dbuf_rele_and_unlock);
EXPORT_SYMBOL(dbuf_refcount);
EXPORT_SYMBOL(dbuf_sync_list);

/* dmu_buf user data and block pointer access */
EXPORT_SYMBOL(dmu_buf_set_user);
EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_get_blkptr);
|
|
|
|
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
|
|
"Maximum size in bytes of the dbuf cache.");
|
|
|
|
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
|
|
"Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");
|
|
|
|
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
|
|
"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
|
|
|
|
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
|
|
"Maximum size in bytes of dbuf metadata cache.");
|
|
|
|
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
|
|
"Set size of dbuf cache to log2 fraction of arc size.");
|
|
|
|
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
|
|
"Set size of dbuf metadata cache to log2 fraction of arc size.");
|
|
|
|
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
|
|
"Set size of dbuf cache mutex array as log2 shift.");
|