mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-23 02:44:41 +03:00
Sequential scrub and resilvers
Currently, scrubs and resilvers can take an extremely long time to complete. This is largely due to the fact that zfs scans process pools in logical order, as determined by each block's bookmark. This makes sense from a simplicity perspective, but blocks in zfs are often scattered randomly across disks, particularly due to zfs's copy-on-write mechanisms.

This patch improves performance by splitting scrubs and resilvers into a metadata scanning phase and an I/O issuing phase. The metadata scan reads through the structure of the pool and gathers an in-memory queue of I/Os, sorted by size and offset on disk. The issuing phase will then issue the scrub I/Os as sequentially as possible, greatly improving performance.

This patch also updates and cleans up some of the scan code which has not been updated in several years.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Authored-by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Authored-by: Alek Pinchuk <apinchuk@datto.com>
Authored-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #3625
Closes #6256
This commit is contained in:
committed by
Brian Behlendorf
parent
e301113c17
commit
d4a72f2386
+84
-39
@@ -357,7 +357,8 @@ int arc_no_grow_shift = 5;
|
||||
* minimum lifespan of a prefetch block in clock ticks
|
||||
* (initialized in arc_init())
|
||||
*/
|
||||
static int arc_min_prefetch_lifespan;
|
||||
static int arc_min_prefetch_ms;
|
||||
static int arc_min_prescient_prefetch_ms;
|
||||
|
||||
/*
|
||||
* If this percent of memory is free, don't throttle.
|
||||
@@ -407,7 +408,8 @@ unsigned long zfs_arc_dnode_limit_percent = 10;
|
||||
* These tunables are Linux specific
|
||||
*/
|
||||
unsigned long zfs_arc_sys_free = 0;
|
||||
int zfs_arc_min_prefetch_lifespan = 0;
|
||||
int zfs_arc_min_prefetch_ms = 0;
|
||||
int zfs_arc_min_prescient_prefetch_ms = 0;
|
||||
int zfs_arc_p_aggressive_disable = 1;
|
||||
int zfs_arc_p_dampener_disable = 1;
|
||||
int zfs_arc_meta_prune = 10000;
|
||||
@@ -663,6 +665,7 @@ typedef struct arc_stats {
|
||||
kstat_named_t arcstat_meta_min;
|
||||
kstat_named_t arcstat_sync_wait_for_async;
|
||||
kstat_named_t arcstat_demand_hit_predictive_prefetch;
|
||||
kstat_named_t arcstat_demand_hit_prescient_prefetch;
|
||||
kstat_named_t arcstat_need_free;
|
||||
kstat_named_t arcstat_sys_free;
|
||||
kstat_named_t arcstat_raw_size;
|
||||
@@ -762,6 +765,7 @@ static arc_stats_t arc_stats = {
|
||||
{ "arc_meta_min", KSTAT_DATA_UINT64 },
|
||||
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
|
||||
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
|
||||
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
|
||||
{ "arc_need_free", KSTAT_DATA_UINT64 },
|
||||
{ "arc_sys_free", KSTAT_DATA_UINT64 },
|
||||
{ "arc_raw_size", KSTAT_DATA_UINT64 }
|
||||
@@ -861,6 +865,8 @@ static taskq_t *arc_prune_taskq;
|
||||
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
|
||||
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
|
||||
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
|
||||
#define HDR_PRESCIENT_PREFETCH(hdr) \
|
||||
((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
|
||||
#define HDR_COMPRESSION_ENABLED(hdr) \
|
||||
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
|
||||
|
||||
@@ -3778,6 +3784,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
{
|
||||
arc_state_t *evicted_state, *state;
|
||||
int64_t bytes_evicted = 0;
|
||||
int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
|
||||
arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
|
||||
|
||||
ASSERT(MUTEX_HELD(hash_lock));
|
||||
ASSERT(HDR_HAS_L1HDR(hdr));
|
||||
@@ -3831,8 +3839,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
/* prefetch buffers have a minimum lifespan */
|
||||
if (HDR_IO_IN_PROGRESS(hdr) ||
|
||||
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
|
||||
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
|
||||
arc_min_prefetch_lifespan)) {
|
||||
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
|
||||
ARCSTAT_BUMP(arcstat_evict_skip);
|
||||
return (bytes_evicted);
|
||||
}
|
||||
@@ -5492,13 +5499,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* - move the buffer to the head of the list if this is
|
||||
* another prefetch (to make it less likely to be evicted).
|
||||
*/
|
||||
if (HDR_PREFETCH(hdr)) {
|
||||
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
|
||||
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
|
||||
/* link protected by hash lock */
|
||||
ASSERT(multilist_link_active(
|
||||
&hdr->b_l1hdr.b_arc_node));
|
||||
} else {
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PREFETCH |
|
||||
ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
|
||||
ARCSTAT_BUMP(arcstat_mru_hits);
|
||||
}
|
||||
@@ -5532,10 +5541,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* MFU state.
|
||||
*/
|
||||
|
||||
if (HDR_PREFETCH(hdr)) {
|
||||
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
|
||||
new_state = arc_mru;
|
||||
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
|
||||
if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PREFETCH |
|
||||
ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
}
|
||||
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
|
||||
} else {
|
||||
new_state = arc_mfu;
|
||||
@@ -5557,11 +5569,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* If it was a prefetch, we will explicitly move it to
|
||||
* the head of the list now.
|
||||
*/
|
||||
if ((HDR_PREFETCH(hdr)) != 0) {
|
||||
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
|
||||
/* link protected by hash_lock */
|
||||
ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
|
||||
}
|
||||
|
||||
atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
|
||||
ARCSTAT_BUMP(arcstat_mfu_hits);
|
||||
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
|
||||
@@ -5573,12 +5581,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
* MFU state.
|
||||
*/
|
||||
|
||||
if (HDR_PREFETCH(hdr)) {
|
||||
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
|
||||
/*
|
||||
* This is a prefetch access...
|
||||
* move this block back to the MRU state.
|
||||
*/
|
||||
ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
|
||||
new_state = arc_mru;
|
||||
}
|
||||
|
||||
@@ -5605,20 +5612,25 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
|
||||
/* a generic arc_read_done_func_t which you can use */
|
||||
/* ARGSUSED */
|
||||
void
|
||||
arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
|
||||
arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||
arc_buf_t *buf, void *arg)
|
||||
{
|
||||
if (error == 0)
|
||||
bcopy(buf->b_data, arg, arc_buf_size(buf));
|
||||
if (buf == NULL)
|
||||
return;
|
||||
|
||||
bcopy(buf->b_data, arg, arc_buf_size(buf));
|
||||
arc_buf_destroy(buf, arg);
|
||||
}
|
||||
|
||||
/* a generic arc_read_done_func_t */
|
||||
/* ARGSUSED */
|
||||
void
|
||||
arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
|
||||
arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||
arc_buf_t *buf, void *arg)
|
||||
{
|
||||
arc_buf_t **bufp = arg;
|
||||
if (error != 0) {
|
||||
arc_buf_destroy(buf, arg);
|
||||
|
||||
if (buf == NULL) {
|
||||
*bufp = NULL;
|
||||
} else {
|
||||
*bufp = buf;
|
||||
@@ -5652,7 +5664,6 @@ arc_read_done(zio_t *zio)
|
||||
arc_callback_t *callback_list;
|
||||
arc_callback_t *acb;
|
||||
boolean_t freeable = B_FALSE;
|
||||
boolean_t no_zio_error = (zio->io_error == 0);
|
||||
|
||||
/*
|
||||
* The hdr was inserted into hash-table and removed from lists
|
||||
@@ -5699,7 +5710,7 @@ arc_read_done(zio_t *zio)
|
||||
}
|
||||
}
|
||||
|
||||
if (no_zio_error) {
|
||||
if (zio->io_error == 0) {
|
||||
/* byteswap if necessary */
|
||||
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
|
||||
if (BP_GET_LEVEL(zio->io_bp) > 0) {
|
||||
@@ -5720,7 +5731,8 @@ arc_read_done(zio_t *zio)
|
||||
callback_list = hdr->b_l1hdr.b_acb;
|
||||
ASSERT3P(callback_list, !=, NULL);
|
||||
|
||||
if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
|
||||
if (hash_lock && zio->io_error == 0 &&
|
||||
hdr->b_l1hdr.b_state == arc_anon) {
|
||||
/*
|
||||
* Only call arc_access on anonymous buffers. This is because
|
||||
* if we've issued an I/O for an evicted buffer, we've already
|
||||
@@ -5741,13 +5753,19 @@ arc_read_done(zio_t *zio)
|
||||
if (!acb->acb_done)
|
||||
continue;
|
||||
|
||||
/* This is a demand read since prefetches don't use callbacks */
|
||||
callback_cnt++;
|
||||
|
||||
if (zio->io_error != 0)
|
||||
continue;
|
||||
|
||||
int error = arc_buf_alloc_impl(hdr, zio->io_spa,
|
||||
acb->acb_dsobj, acb->acb_private, acb->acb_encrypted,
|
||||
acb->acb_compressed, acb->acb_noauth, no_zio_error,
|
||||
acb->acb_compressed, acb->acb_noauth, B_TRUE,
|
||||
&acb->acb_buf);
|
||||
if (error != 0) {
|
||||
arc_buf_destroy(acb->acb_buf, acb->acb_private);
|
||||
acb->acb_buf = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Assert non-speculative zios didn't fail because an
|
||||
@@ -5770,9 +5788,8 @@ arc_read_done(zio_t *zio)
|
||||
}
|
||||
}
|
||||
|
||||
if (no_zio_error) {
|
||||
if (zio->io_error == 0)
|
||||
zio->io_error = error;
|
||||
}
|
||||
}
|
||||
hdr->b_l1hdr.b_acb = NULL;
|
||||
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
|
||||
@@ -5782,7 +5799,7 @@ arc_read_done(zio_t *zio)
|
||||
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
|
||||
callback_list != NULL);
|
||||
|
||||
if (no_zio_error) {
|
||||
if (zio->io_error == 0) {
|
||||
arc_hdr_verify(hdr, zio->io_bp);
|
||||
} else {
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
|
||||
@@ -5816,8 +5833,8 @@ arc_read_done(zio_t *zio)
|
||||
/* execute each callback and free its structure */
|
||||
while ((acb = callback_list) != NULL) {
|
||||
if (acb->acb_done) {
|
||||
acb->acb_done(zio, zio->io_error, acb->acb_buf,
|
||||
acb->acb_private);
|
||||
acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
|
||||
acb->acb_buf, acb->acb_private);
|
||||
}
|
||||
|
||||
if (acb->acb_zio_dummy != NULL) {
|
||||
@@ -5974,12 +5991,25 @@ top:
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PREDICTIVE_PREFETCH);
|
||||
}
|
||||
|
||||
if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
|
||||
ARCSTAT_BUMP(
|
||||
arcstat_demand_hit_prescient_prefetch);
|
||||
arc_hdr_clear_flags(hdr,
|
||||
ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
}
|
||||
|
||||
ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
|
||||
|
||||
/* Get a buf with the desired data in it. */
|
||||
rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset,
|
||||
private, encrypted_read, compressed_read,
|
||||
noauth_read, B_TRUE, &buf);
|
||||
if (rc != 0) {
|
||||
arc_buf_destroy(buf, private);
|
||||
buf = NULL;
|
||||
}
|
||||
|
||||
ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0);
|
||||
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
|
||||
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
|
||||
@@ -5987,6 +6017,8 @@ top:
|
||||
}
|
||||
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
|
||||
arc_access(hdr, hash_lock);
|
||||
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
if (*arc_flags & ARC_FLAG_L2CACHE)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
|
||||
mutex_exit(hash_lock);
|
||||
@@ -5996,7 +6028,7 @@ top:
|
||||
data, metadata, hits);
|
||||
|
||||
if (done)
|
||||
done(NULL, rc, buf, private);
|
||||
done(NULL, zb, bp, buf, private);
|
||||
} else {
|
||||
uint64_t lsize = BP_GET_LSIZE(bp);
|
||||
uint64_t psize = BP_GET_PSIZE(bp);
|
||||
@@ -6112,6 +6144,8 @@ top:
|
||||
if (*arc_flags & ARC_FLAG_PREFETCH &&
|
||||
refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
|
||||
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
|
||||
if (*arc_flags & ARC_FLAG_L2CACHE)
|
||||
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
|
||||
if (BP_IS_AUTHENTICATED(bp))
|
||||
@@ -7223,9 +7257,15 @@ arc_tuning_update(void)
|
||||
if (zfs_arc_p_min_shift)
|
||||
arc_p_min_shift = zfs_arc_p_min_shift;
|
||||
|
||||
/* Valid range: 1 - N ticks */
|
||||
if (zfs_arc_min_prefetch_lifespan)
|
||||
arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
|
||||
/* Valid range: 1 - N ms */
|
||||
if (zfs_arc_min_prefetch_ms)
|
||||
arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
|
||||
|
||||
/* Valid range: 1 - N ms */
|
||||
if (zfs_arc_min_prescient_prefetch_ms) {
|
||||
arc_min_prescient_prefetch_ms =
|
||||
zfs_arc_min_prescient_prefetch_ms;
|
||||
}
|
||||
|
||||
/* Valid range: 0 - 100 */
|
||||
if ((zfs_arc_lotsfree_percent >= 0) &&
|
||||
@@ -7368,7 +7408,8 @@ arc_init(void)
|
||||
cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
|
||||
|
||||
/* Convert seconds to clock ticks */
|
||||
arc_min_prefetch_lifespan = 1 * hz;
|
||||
arc_min_prefetch_ms = 1;
|
||||
arc_min_prescient_prefetch_ms = 6;
|
||||
|
||||
#ifdef _KERNEL
|
||||
/*
|
||||
@@ -9006,8 +9047,12 @@ MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
|
||||
module_param(zfs_compressed_arc_enabled, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers");
|
||||
|
||||
module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
|
||||
module_param(zfs_arc_min_prefetch_ms, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms");
|
||||
|
||||
module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644);
|
||||
MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms,
|
||||
"Min life of prescient prefetched block in ms");
|
||||
|
||||
module_param(l2arc_write_max, ulong, 0644);
|
||||
MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
|
||||
|
||||
+16
-6
@@ -973,7 +973,8 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
|
||||
}
|
||||
|
||||
static void
|
||||
dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
|
||||
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||
arc_buf_t *buf, void *vdb)
|
||||
{
|
||||
dmu_buf_impl_t *db = vdb;
|
||||
|
||||
@@ -987,19 +988,22 @@ dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
|
||||
ASSERT(db->db.db_data == NULL);
|
||||
if (db->db_level == 0 && db->db_freed_in_flight) {
|
||||
/* we were freed in flight; disregard any error */
|
||||
if (buf == NULL) {
|
||||
buf = arc_alloc_buf(db->db_objset->os_spa,
|
||||
db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
|
||||
}
|
||||
arc_release(buf, db);
|
||||
bzero(buf->b_data, db->db.db_size);
|
||||
arc_buf_freeze(buf);
|
||||
db->db_freed_in_flight = FALSE;
|
||||
dbuf_set_data(db, buf);
|
||||
db->db_state = DB_CACHED;
|
||||
} else if (err == 0) {
|
||||
} else if (buf != NULL) {
|
||||
dbuf_set_data(db, buf);
|
||||
db->db_state = DB_CACHED;
|
||||
} else {
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
ASSERT3P(db->db_buf, ==, NULL);
|
||||
arc_buf_destroy(buf, db);
|
||||
db->db_state = DB_UNCACHED;
|
||||
}
|
||||
cv_broadcast(&db->db_changed);
|
||||
@@ -2512,7 +2516,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
|
||||
* prefetch if the next block down is our target.
|
||||
*/
|
||||
static void
|
||||
dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
|
||||
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
|
||||
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
|
||||
{
|
||||
dbuf_prefetch_arg_t *dpa = private;
|
||||
|
||||
@@ -2551,13 +2556,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
|
||||
dbuf_rele(db, FTAG);
|
||||
}
|
||||
|
||||
dpa->dpa_curlevel--;
|
||||
if (abuf == NULL) {
|
||||
kmem_free(dpa, sizeof (*dpa));
|
||||
return;
|
||||
}
|
||||
|
||||
dpa->dpa_curlevel--;
|
||||
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
|
||||
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
|
||||
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
|
||||
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
|
||||
if (BP_IS_HOLE(bp) || err != 0) {
|
||||
|
||||
if (BP_IS_HOLE(bp)) {
|
||||
kmem_free(dpa, sizeof (*dpa));
|
||||
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
|
||||
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
|
||||
|
||||
+15
-2
@@ -1172,14 +1172,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
|
||||
void
|
||||
ddt_sync(spa_t *spa, uint64_t txg)
|
||||
{
|
||||
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
|
||||
dmu_tx_t *tx;
|
||||
zio_t *rio = zio_root(spa, NULL, NULL,
|
||||
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
|
||||
zio_t *rio;
|
||||
|
||||
ASSERT(spa_syncing_txg(spa) == txg);
|
||||
|
||||
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
||||
|
||||
rio = zio_root(spa, NULL, NULL,
|
||||
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
|
||||
|
||||
/*
|
||||
* This function may cause an immediate scan of ddt blocks (see
|
||||
* the comment above dsl_scan_ddt() for details). We set the
|
||||
* scan's root zio here so that we can wait for any scan IOs in
|
||||
* addition to the regular ddt IOs.
|
||||
*/
|
||||
ASSERT3P(scn->scn_zio_root, ==, NULL);
|
||||
scn->scn_zio_root = rio;
|
||||
|
||||
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
|
||||
ddt_t *ddt = spa->spa_ddt[c];
|
||||
if (ddt == NULL)
|
||||
@@ -1189,6 +1201,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
|
||||
}
|
||||
|
||||
(void) zio_wait(rio);
|
||||
scn->scn_zio_root = NULL;
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
}
|
||||
|
||||
@@ -520,7 +520,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
||||
{
|
||||
prefetch_data_t *pfd = arg;
|
||||
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
|
||||
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
|
||||
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
|
||||
ARC_FLAG_PRESCIENT_PREFETCH;
|
||||
|
||||
ASSERT(pfd->pd_bytes_fetched >= 0);
|
||||
if (bp == NULL)
|
||||
|
||||
@@ -390,8 +390,10 @@ dsl_pool_close(dsl_pool_t *dp)
|
||||
mutex_destroy(&dp->dp_lock);
|
||||
cv_destroy(&dp->dp_spaceavail_cv);
|
||||
taskq_destroy(dp->dp_iput_taskq);
|
||||
if (dp->dp_blkstats)
|
||||
if (dp->dp_blkstats) {
|
||||
mutex_destroy(&dp->dp_blkstats->zab_lock);
|
||||
vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
|
||||
}
|
||||
kmem_free(dp, sizeof (dsl_pool_t));
|
||||
}
|
||||
|
||||
|
||||
+2235
-475
File diff suppressed because it is too large
Load Diff
+2
-80
@@ -971,85 +971,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
|
||||
return (AVL_CMP(r1->rs_start, r2->rs_start));
|
||||
}
|
||||
|
||||
/*
|
||||
* Create any block allocator specific components. The current allocators
|
||||
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
|
||||
*/
|
||||
static void
|
||||
metaslab_rt_create(range_tree_t *rt, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT(msp->ms_tree == NULL);
|
||||
|
||||
avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
|
||||
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
|
||||
}
|
||||
|
||||
/*
|
||||
* Destroy the block allocator specific components.
|
||||
*/
|
||||
static void
|
||||
metaslab_rt_destroy(range_tree_t *rt, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_tree, ==, rt);
|
||||
ASSERT0(avl_numnodes(&msp->ms_size_tree));
|
||||
|
||||
avl_destroy(&msp->ms_size_tree);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_tree, ==, rt);
|
||||
VERIFY(!msp->ms_condensing);
|
||||
avl_add(&msp->ms_size_tree, rs);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_tree, ==, rt);
|
||||
VERIFY(!msp->ms_condensing);
|
||||
avl_remove(&msp->ms_size_tree, rs);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_rt_vacate(range_tree_t *rt, void *arg)
|
||||
{
|
||||
metaslab_t *msp = arg;
|
||||
|
||||
ASSERT3P(rt->rt_arg, ==, msp);
|
||||
ASSERT3P(msp->ms_tree, ==, rt);
|
||||
|
||||
/*
|
||||
* Normally one would walk the tree freeing nodes along the way.
|
||||
* Since the nodes are shared with the range trees we can avoid
|
||||
* walking all nodes and just reinitialize the avl tree. The nodes
|
||||
* will be freed by the range tree, so we don't want to free them here.
|
||||
*/
|
||||
avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
|
||||
sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
|
||||
}
|
||||
|
||||
static range_tree_ops_t metaslab_rt_ops = {
|
||||
metaslab_rt_create,
|
||||
metaslab_rt_destroy,
|
||||
metaslab_rt_add,
|
||||
metaslab_rt_remove,
|
||||
metaslab_rt_vacate
|
||||
};
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* Common allocator routines
|
||||
@@ -1425,7 +1346,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
|
||||
* addition of new space; and for debugging, it ensures that we'd
|
||||
* data fault on any attempt to use this metaslab before it's ready.
|
||||
*/
|
||||
ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
|
||||
ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
|
||||
metaslab_rangesize_compare, &ms->ms_lock, 0);
|
||||
metaslab_group_add(mg, ms);
|
||||
|
||||
metaslab_set_fragmentation(ms);
|
||||
|
||||
+296
-28
@@ -33,8 +33,58 @@
|
||||
#include <sys/zio.h>
|
||||
#include <sys/range_tree.h>
|
||||
|
||||
/*
|
||||
* Range trees are tree-based data structures that can be used to
|
||||
* track free space or generally any space allocation information.
|
||||
* A range tree keeps track of individual segments and automatically
|
||||
* provides facilities such as adjacent extent merging and extent
|
||||
* splitting in response to range add/remove requests.
|
||||
*
|
||||
* A range tree starts out completely empty, with no segments in it.
|
||||
* Adding an allocation via range_tree_add to the range tree can either:
|
||||
* 1) create a new extent
|
||||
* 2) extend an adjacent extent
|
||||
* 3) merge two adjacent extents
|
||||
* Conversely, removing an allocation via range_tree_remove can:
|
||||
* 1) completely remove an extent
|
||||
* 2) shorten an extent (if the allocation was near one of its ends)
|
||||
* 3) split an extent into two extents, in effect punching a hole
|
||||
*
|
||||
* A range tree is also capable of 'bridging' gaps when adding
|
||||
* allocations. This is useful for cases when close proximity of
|
||||
* allocations is an important detail that needs to be represented
|
||||
* in the range tree. See range_tree_set_gap(). The default behavior
|
||||
* is not to bridge gaps (i.e. the maximum allowed gap size is 0).
|
||||
*
|
||||
* In order to traverse a range tree, use either the range_tree_walk()
|
||||
* or range_tree_vacate() functions.
|
||||
*
|
||||
* To obtain more accurate information on individual segment
|
||||
* operations that the range tree performs "under the hood", you can
|
||||
* specify a set of callbacks by passing a range_tree_ops_t structure
|
||||
* to the range_tree_create function. Any callbacks that are non-NULL
|
||||
* are then called at the appropriate times.
|
||||
*
|
||||
* The range tree code also supports a special variant of range trees
|
||||
* that can bridge small gaps between segments. This kind of tree is used
|
||||
* by the dsl scanning code to group I/Os into mostly sequential chunks to
|
||||
* optimize disk performance. The code here attempts to do this with as
|
||||
* little memory and computational overhead as possible. One limitation of
|
||||
* this implementation is that segments of range trees with gaps can only
|
||||
* support removing complete segments.
|
||||
*/
|
||||
|
||||
kmem_cache_t *range_seg_cache;
|
||||
|
||||
/* Generic ops for managing an AVL tree alongside a range tree */
|
||||
struct range_tree_ops rt_avl_ops = {
|
||||
.rtop_create = rt_avl_create,
|
||||
.rtop_destroy = rt_avl_destroy,
|
||||
.rtop_add = rt_avl_add,
|
||||
.rtop_remove = rt_avl_remove,
|
||||
.rtop_vacate = rt_avl_vacate,
|
||||
};
|
||||
|
||||
void
|
||||
range_tree_init(void)
|
||||
{
|
||||
@@ -75,6 +125,18 @@ range_tree_stat_verify(range_tree_t *rt)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Changes out the lock used by the range tree. Useful when you are moving
|
||||
* the range tree between containing structures without having to recreate
|
||||
* it. Both the old and new locks must be held by the caller.
|
||||
*/
|
||||
void
|
||||
range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
|
||||
rt->rt_lock = lp;
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
|
||||
{
|
||||
@@ -121,31 +183,38 @@ range_tree_seg_compare(const void *x1, const void *x2)
|
||||
}
|
||||
|
||||
range_tree_t *
|
||||
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
|
||||
range_tree_create_impl(range_tree_ops_t *ops, void *arg,
|
||||
int (*avl_compare) (const void *, const void *), kmutex_t *lp, uint64_t gap)
|
||||
{
|
||||
range_tree_t *rt;
|
||||
|
||||
rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
|
||||
range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
|
||||
|
||||
avl_create(&rt->rt_root, range_tree_seg_compare,
|
||||
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
|
||||
|
||||
rt->rt_lock = lp;
|
||||
rt->rt_ops = ops;
|
||||
rt->rt_gap = gap;
|
||||
rt->rt_arg = arg;
|
||||
rt->rt_avl_compare = avl_compare;
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
|
||||
rt->rt_ops->rtop_create(rt, rt->rt_arg);
|
||||
|
||||
return (rt);
|
||||
}
|
||||
|
||||
range_tree_t *
|
||||
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
|
||||
{
|
||||
return (range_tree_create_impl(ops, arg, NULL, lp, 0));
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_destroy(range_tree_t *rt)
|
||||
{
|
||||
VERIFY0(rt->rt_space);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
|
||||
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
|
||||
|
||||
avl_destroy(&rt->rt_root);
|
||||
@@ -153,40 +222,102 @@ range_tree_destroy(range_tree_t *rt)
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_add(void *arg, uint64_t start, uint64_t size)
|
||||
range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
|
||||
ASSERT3U(rs->rs_fill + delta, !=, 0);
|
||||
ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
|
||||
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
rs->rs_fill += delta;
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
|
||||
{
|
||||
range_tree_t *rt = arg;
|
||||
avl_index_t where;
|
||||
range_seg_t rsearch, *rs_before, *rs_after, *rs;
|
||||
uint64_t end = start + size;
|
||||
uint64_t end = start + size, gap = rt->rt_gap;
|
||||
uint64_t bridge_size = 0;
|
||||
boolean_t merge_before, merge_after;
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
VERIFY(size != 0);
|
||||
ASSERT3U(size, !=, 0);
|
||||
ASSERT3U(fill, <=, size);
|
||||
|
||||
rsearch.rs_start = start;
|
||||
rsearch.rs_end = end;
|
||||
rs = avl_find(&rt->rt_root, &rsearch, &where);
|
||||
|
||||
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
|
||||
if (gap == 0 && rs != NULL &&
|
||||
rs->rs_start <= start && rs->rs_end >= end) {
|
||||
zfs_panic_recover("zfs: allocating allocated segment"
|
||||
"(offset=%llu size=%llu)\n",
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
"(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
|
||||
(longlong_t)start, (longlong_t)size,
|
||||
(longlong_t)rs->rs_start,
|
||||
(longlong_t)rs->rs_end - rs->rs_start);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Make sure we don't overlap with either of our neighbors */
|
||||
VERIFY(rs == NULL);
|
||||
/*
|
||||
* If this is a gap-supporting range tree, it is possible that we
|
||||
* are inserting into an existing segment. In this case simply
|
||||
* bump the fill count and call the remove / add callbacks. If the
|
||||
* new range will extend an existing segment, we remove the
|
||||
* existing one, apply the new extent to it and re-insert it using
|
||||
* the normal code paths.
|
||||
*/
|
||||
if (rs != NULL) {
|
||||
ASSERT3U(gap, !=, 0);
|
||||
if (rs->rs_start <= start && rs->rs_end >= end) {
|
||||
range_tree_adjust_fill(rt, rs, fill);
|
||||
return;
|
||||
}
|
||||
|
||||
avl_remove(&rt->rt_root, rs);
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs);
|
||||
rt->rt_space -= rs->rs_end - rs->rs_start;
|
||||
|
||||
fill += rs->rs_fill;
|
||||
start = MIN(start, rs->rs_start);
|
||||
end = MAX(end, rs->rs_end);
|
||||
size = end - start;
|
||||
|
||||
range_tree_add_impl(rt, start, size, fill);
|
||||
|
||||
kmem_cache_free(range_seg_cache, rs);
|
||||
return;
|
||||
}
|
||||
|
||||
ASSERT3P(rs, ==, NULL);
|
||||
|
||||
/*
|
||||
* Determine whether or not we will have to merge with our neighbors.
|
||||
* If gap != 0, we might need to merge with our neighbors even if we
|
||||
* aren't directly touching.
|
||||
*/
|
||||
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
|
||||
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
|
||||
|
||||
merge_before = (rs_before != NULL && rs_before->rs_end == start);
|
||||
merge_after = (rs_after != NULL && rs_after->rs_start == end);
|
||||
merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
|
||||
merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
|
||||
|
||||
if (merge_before && gap != 0)
|
||||
bridge_size += start - rs_before->rs_end;
|
||||
if (merge_after && gap != 0)
|
||||
bridge_size += rs_after->rs_start - end;
|
||||
|
||||
if (merge_before && merge_after) {
|
||||
avl_remove(&rt->rt_root, rs_before);
|
||||
if (rt->rt_ops != NULL) {
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
|
||||
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
|
||||
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
|
||||
}
|
||||
@@ -194,43 +325,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
|
||||
range_tree_stat_decr(rt, rs_before);
|
||||
range_tree_stat_decr(rt, rs_after);
|
||||
|
||||
rs_after->rs_fill += rs_before->rs_fill + fill;
|
||||
rs_after->rs_start = rs_before->rs_start;
|
||||
kmem_cache_free(range_seg_cache, rs_before);
|
||||
rs = rs_after;
|
||||
} else if (merge_before) {
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs_before);
|
||||
|
||||
rs_before->rs_fill += fill;
|
||||
rs_before->rs_end = end;
|
||||
rs = rs_before;
|
||||
} else if (merge_after) {
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
|
||||
|
||||
range_tree_stat_decr(rt, rs_after);
|
||||
|
||||
rs_after->rs_fill += fill;
|
||||
rs_after->rs_start = start;
|
||||
rs = rs_after;
|
||||
} else {
|
||||
rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
|
||||
|
||||
rs->rs_fill = fill;
|
||||
rs->rs_start = start;
|
||||
rs->rs_end = end;
|
||||
avl_insert(&rt->rt_root, rs, where);
|
||||
}
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (gap != 0)
|
||||
ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
|
||||
else
|
||||
ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
|
||||
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
|
||||
range_tree_stat_incr(rt, rs);
|
||||
rt->rt_space += size;
|
||||
rt->rt_space += size + bridge_size;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
range_tree_add(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_add_impl(arg, start, size, size);
|
||||
}
|
||||
|
||||
static void
|
||||
range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
|
||||
boolean_t do_fill)
|
||||
{
|
||||
range_tree_t *rt = arg;
|
||||
avl_index_t where;
|
||||
range_seg_t rsearch, *rs, *newseg;
|
||||
uint64_t end = start + size;
|
||||
@@ -251,6 +398,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
(longlong_t)start, (longlong_t)size);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Range trees with gap support must only remove complete segments
|
||||
* from the tree. This allows us to maintain accurate fill accounting
|
||||
* and to ensure that bridged sections are not leaked. If we need to
|
||||
* remove less than the full segment, we can only adjust the fill count.
|
||||
*/
|
||||
if (rt->rt_gap != 0) {
|
||||
if (do_fill) {
|
||||
if (rs->rs_fill == size) {
|
||||
start = rs->rs_start;
|
||||
end = rs->rs_end;
|
||||
size = end - start;
|
||||
} else {
|
||||
range_tree_adjust_fill(rt, rs, -size);
|
||||
return;
|
||||
}
|
||||
} else if (rs->rs_start != start || rs->rs_end != end) {
|
||||
zfs_panic_recover("zfs: freeing partial segment of "
|
||||
"gap tree (offset=%llu size=%llu) of "
|
||||
"(offset=%llu size=%llu)",
|
||||
(longlong_t)start, (longlong_t)size,
|
||||
(longlong_t)rs->rs_start,
|
||||
(longlong_t)rs->rs_end - rs->rs_start);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
VERIFY3U(rs->rs_start, <=, start);
|
||||
VERIFY3U(rs->rs_end, >=, end);
|
||||
|
||||
@@ -259,19 +434,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
|
||||
range_tree_stat_decr(rt, rs);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
|
||||
if (left_over && right_over) {
|
||||
newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
|
||||
newseg->rs_start = end;
|
||||
newseg->rs_end = rs->rs_end;
|
||||
newseg->rs_fill = newseg->rs_end - newseg->rs_start;
|
||||
range_tree_stat_incr(rt, newseg);
|
||||
|
||||
rs->rs_end = start;
|
||||
|
||||
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
|
||||
} else if (left_over) {
|
||||
rs->rs_end = start;
|
||||
@@ -284,15 +460,55 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
}
|
||||
|
||||
if (rs != NULL) {
|
||||
/*
|
||||
* The fill of the leftover segment will always be equal to
|
||||
* the size, since we do not support removing partial segments
|
||||
* of range trees with gaps.
|
||||
*/
|
||||
rs->rs_fill = rs->rs_end - rs->rs_start;
|
||||
range_tree_stat_incr(rt, rs);
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
}
|
||||
|
||||
rt->rt_space -= size;
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_remove(void *arg, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_remove_impl(arg, start, size, B_FALSE);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_tree_remove_impl(rt, start, size, B_TRUE);
|
||||
}
|
||||
|
||||
void
|
||||
range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
|
||||
uint64_t newstart, uint64_t newsize)
|
||||
{
|
||||
int64_t delta = newsize - (rs->rs_end - rs->rs_start);
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
|
||||
range_tree_stat_decr(rt, rs);
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
|
||||
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
|
||||
|
||||
rs->rs_start = newstart;
|
||||
rs->rs_end = newstart + newsize;
|
||||
|
||||
range_tree_stat_incr(rt, rs);
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
|
||||
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
|
||||
|
||||
rt->rt_space += delta;
|
||||
}
|
||||
|
||||
static range_seg_t *
|
||||
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
{
|
||||
@@ -308,7 +524,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
return (avl_find(&rt->rt_root, &rsearch, &where));
|
||||
}
|
||||
|
||||
static range_seg_t *
|
||||
range_seg_t *
|
||||
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||
{
|
||||
range_seg_t *rs = range_tree_find_impl(rt, start, size);
|
||||
@@ -373,7 +589,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
|
||||
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
|
||||
if (rt->rt_ops != NULL)
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
|
||||
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
|
||||
|
||||
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
|
||||
@@ -397,8 +613,60 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
|
||||
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
|
||||
}
|
||||
|
||||
range_seg_t *
|
||||
range_tree_first(range_tree_t *rt)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||
return (avl_first(&rt->rt_root));
|
||||
}
|
||||
|
||||
uint64_t
|
||||
range_tree_space(range_tree_t *rt)
|
||||
{
|
||||
return (rt->rt_space);
|
||||
}
|
||||
|
||||
/* Generic range tree functions for maintaining segments in an AVL tree. */
|
||||
void
|
||||
rt_avl_create(range_tree_t *rt, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
|
||||
avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
|
||||
offsetof(range_seg_t, rs_pp_node));
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_destroy(range_tree_t *rt, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
|
||||
ASSERT0(avl_numnodes(tree));
|
||||
avl_destroy(tree);
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
avl_add(tree, rs);
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
avl_tree_t *tree = arg;
|
||||
avl_remove(tree, rs);
|
||||
}
|
||||
|
||||
void
|
||||
rt_avl_vacate(range_tree_t *rt, void *arg)
|
||||
{
|
||||
/*
|
||||
* Normally one would walk the tree freeing nodes along the way.
|
||||
* Since the nodes are shared with the range trees we can avoid
|
||||
* walking all nodes and just reinitialize the avl tree. The nodes
|
||||
* will be freed by the range tree, so we don't want to free them here.
|
||||
*/
|
||||
rt_avl_create(rt, arg);
|
||||
}
|
||||
|
||||
+3
-3
@@ -1996,7 +1996,7 @@ spa_load_verify_done(zio_t *zio)
|
||||
}
|
||||
|
||||
mutex_enter(&spa->spa_scrub_lock);
|
||||
spa->spa_scrub_inflight--;
|
||||
spa->spa_load_verify_ios--;
|
||||
cv_broadcast(&spa->spa_scrub_io_cv);
|
||||
mutex_exit(&spa->spa_scrub_lock);
|
||||
}
|
||||
@@ -2030,9 +2030,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
|
||||
size_t size = BP_GET_PSIZE(bp);
|
||||
|
||||
mutex_enter(&spa->spa_scrub_lock);
|
||||
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
|
||||
while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
|
||||
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
|
||||
spa->spa_scrub_inflight++;
|
||||
spa->spa_load_verify_ios++;
|
||||
mutex_exit(&spa->spa_scrub_lock);
|
||||
|
||||
zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
|
||||
|
||||
@@ -1892,6 +1892,7 @@ spa_init(int mode)
|
||||
zpool_feature_init();
|
||||
spa_config_load();
|
||||
l2arc_start();
|
||||
scan_init();
|
||||
qat_init();
|
||||
}
|
||||
|
||||
@@ -1915,6 +1916,7 @@ spa_fini(void)
|
||||
unique_fini();
|
||||
refcount_fini();
|
||||
fm_fini();
|
||||
scan_fini();
|
||||
qat_fini();
|
||||
|
||||
avl_destroy(&spa_namespace_avl);
|
||||
@@ -2016,6 +2018,7 @@ spa_scan_stat_init(spa_t *spa)
|
||||
spa->spa_scan_pass_scrub_pause = 0;
|
||||
spa->spa_scan_pass_scrub_spent_paused = 0;
|
||||
spa->spa_scan_pass_exam = 0;
|
||||
spa->spa_scan_pass_issued = 0;
|
||||
vdev_scan_stat_init(spa->spa_root_vdev);
|
||||
}
|
||||
|
||||
@@ -2033,18 +2036,21 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
|
||||
|
||||
/* data stored on disk */
|
||||
ps->pss_func = scn->scn_phys.scn_func;
|
||||
ps->pss_state = scn->scn_phys.scn_state;
|
||||
ps->pss_start_time = scn->scn_phys.scn_start_time;
|
||||
ps->pss_end_time = scn->scn_phys.scn_end_time;
|
||||
ps->pss_to_examine = scn->scn_phys.scn_to_examine;
|
||||
ps->pss_examined = scn->scn_phys.scn_examined;
|
||||
ps->pss_to_process = scn->scn_phys.scn_to_process;
|
||||
ps->pss_processed = scn->scn_phys.scn_processed;
|
||||
ps->pss_errors = scn->scn_phys.scn_errors;
|
||||
ps->pss_state = scn->scn_phys.scn_state;
|
||||
ps->pss_examined = scn->scn_phys.scn_examined;
|
||||
ps->pss_issued =
|
||||
scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
|
||||
|
||||
/* data not stored on disk */
|
||||
ps->pss_pass_start = spa->spa_scan_pass_start;
|
||||
ps->pss_pass_exam = spa->spa_scan_pass_exam;
|
||||
ps->pss_pass_issued = spa->spa_scan_pass_issued;
|
||||
ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
|
||||
ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
|
||||
|
||||
|
||||
@@ -360,6 +360,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
||||
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
|
||||
for (int t = 0; t < DTL_TYPES; t++) {
|
||||
vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
|
||||
@@ -647,6 +648,18 @@ vdev_free(vdev_t *vd)
|
||||
{
|
||||
spa_t *spa = vd->vdev_spa;
|
||||
|
||||
/*
|
||||
* Scan queues are normally destroyed at the end of a scan. If the
|
||||
* queue exists here, that implies the vdev is being removed while
|
||||
* the scan is still running.
|
||||
*/
|
||||
if (vd->vdev_scan_io_queue != NULL) {
|
||||
mutex_enter(&vd->vdev_scan_io_queue_lock);
|
||||
dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
|
||||
vd->vdev_scan_io_queue = NULL;
|
||||
mutex_exit(&vd->vdev_scan_io_queue_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* vdev_free() implies closing the vdev first. This is simpler than
|
||||
* trying to ensure complicated semantics for all callers.
|
||||
@@ -723,6 +736,7 @@ vdev_free(vdev_t *vd)
|
||||
mutex_destroy(&vd->vdev_dtl_lock);
|
||||
mutex_destroy(&vd->vdev_stat_lock);
|
||||
mutex_destroy(&vd->vdev_probe_lock);
|
||||
mutex_destroy(&vd->vdev_scan_io_queue_lock);
|
||||
|
||||
zfs_ratelimit_fini(&vd->vdev_delay_rl);
|
||||
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
|
||||
@@ -800,6 +814,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
|
||||
|
||||
tvd->vdev_islog = svd->vdev_islog;
|
||||
svd->vdev_islog = 0;
|
||||
|
||||
dsl_scan_io_queue_vdev_xfer(svd, tvd);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -169,7 +169,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
|
||||
* we include spans of optional I/Os to aid aggregation at the disk even when
|
||||
* they aren't able to help us aggregate at this level.
|
||||
*/
|
||||
int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
|
||||
int zfs_vdev_aggregation_limit = 1 << 20;
|
||||
int zfs_vdev_read_gap_limit = 32 << 10;
|
||||
int zfs_vdev_write_gap_limit = 4 << 10;
|
||||
|
||||
|
||||
+1
-1
@@ -1070,7 +1070,7 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
|
||||
}
|
||||
err = zap_add(os, intoobj, za.za_name,
|
||||
8, 1, &value, tx);
|
||||
if (err)
|
||||
if (err != 0)
|
||||
break;
|
||||
}
|
||||
zap_cursor_fini(&zc);
|
||||
|
||||
+2
-20
@@ -39,6 +39,7 @@
|
||||
#include <sys/ddt.h>
|
||||
#include <sys/blkptr.h>
|
||||
#include <sys/zfeature.h>
|
||||
#include <sys/dsl_scan.h>
|
||||
#include <sys/metaslab_impl.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/trace_zio.h>
|
||||
@@ -1050,6 +1051,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
|
||||
|
||||
metaslab_check_free(spa, bp);
|
||||
arc_freed(spa, bp);
|
||||
dsl_scan_freed(spa, bp);
|
||||
|
||||
/*
|
||||
* GANG and DEDUP blocks can induce a read (for the gang block header,
|
||||
@@ -3333,26 +3335,6 @@ zio_vdev_io_start(zio_t *zio)
|
||||
|
||||
ASSERT3P(zio->io_logical, !=, zio);
|
||||
|
||||
/*
|
||||
* We keep track of time-sensitive I/Os so that the scan thread
|
||||
* can quickly react to certain workloads. In particular, we care
|
||||
* about non-scrubbing, top-level reads and writes with the following
|
||||
* characteristics:
|
||||
* - synchronous writes of user data to non-slog devices
|
||||
* - any reads of user data
|
||||
* When these conditions are met, adjust the timestamp of spa_last_io
|
||||
* which allows the scan thread to adjust its workload accordingly.
|
||||
*/
|
||||
if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
|
||||
vd == vd->vdev_top && !vd->vdev_islog &&
|
||||
zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
|
||||
zio->io_txg != spa_syncing_txg(spa)) {
|
||||
uint64_t old = spa->spa_last_io;
|
||||
uint64_t new = ddi_get_lbolt64();
|
||||
if (old != new)
|
||||
(void) atomic_cas_64(&spa->spa_last_io, old, new);
|
||||
}
|
||||
|
||||
align = 1ULL << vd->vdev_top->vdev_ashift;
|
||||
|
||||
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
|
||||
|
||||
Reference in New Issue
Block a user