mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-26 09:54:22 +03:00
Several sorted scrub optimizations
- Reduce size and comparison complexity of q_exts_by_size B-tree. Previous code used two 64-bit divisions and many other operations to compare two B-tree elements. It created enormous overhead. This implementation moves the math to the upper level and stores the score in the B-tree elements themselves. Since all that we need to store in that B-tree is the extent score and offset, those can fit into single 8 byte value instead of 24 bytes of q_exts_by_addr element and can be compared with single operation. - Better decouple secondary tree logic from main range_tree by moving rt_btree_ops and related functions into dsl_scan.c as ext_size_ops. Those functions are very small to worry about the code duplication and range_tree does not need to know details such as rt_btree_compare. - Instead of accounting number of pending bytes per pool, that needs atomic on global variable per block, account the number of non-empty per-vdev queues, that change much more rarely. - When extent scan is interrupted by TXG end, continue it in the next TXG instead of selecting next best extent. It allows to avoid leaving one truncated (and so likely not the best any more) extent each TXG. On top of some other optimizations this saves about 1.5 minutes out of 10 to scrub pool of 12 SSDs, storing 1.5TB of 4KB zvol blocks. Reviewed-by: Paul Dagnelie <pcd@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tom Caputi <caputit1@tcnj.edu> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored-By: iXsystems, Inc. Closes #13576
This commit is contained in:
parent
83691bebf0
commit
1c0c729ab4
@ -155,7 +155,7 @@ typedef struct dsl_scan {
|
||||
dsl_scan_phys_t scn_phys; /* on disk representation of scan */
|
||||
dsl_scan_phys_t scn_phys_cached;
|
||||
avl_tree_t scn_queue; /* queue of datasets to scan */
|
||||
uint64_t scn_bytes_pending; /* outstanding data to issue */
|
||||
uint64_t scn_queues_pending; /* outstanding data to issue */
|
||||
} dsl_scan_t;
|
||||
|
||||
typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
|
||||
|
@ -64,11 +64,7 @@ typedef struct range_tree {
|
||||
uint8_t rt_shift;
|
||||
uint64_t rt_start;
|
||||
const range_tree_ops_t *rt_ops;
|
||||
|
||||
/* rt_btree_compare should only be set if rt_arg is a b-tree */
|
||||
void *rt_arg;
|
||||
int (*rt_btree_compare)(const void *, const void *);
|
||||
|
||||
uint64_t rt_gap; /* allowable inter-segment gap */
|
||||
|
||||
/*
|
||||
@ -278,9 +274,9 @@ rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
|
||||
|
||||
typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
|
||||
|
||||
range_tree_t *range_tree_create_impl(const range_tree_ops_t *ops,
|
||||
range_tree_t *range_tree_create_gap(const range_tree_ops_t *ops,
|
||||
range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
|
||||
int (*zfs_btree_compare) (const void *, const void *), uint64_t gap);
|
||||
uint64_t gap);
|
||||
range_tree_t *range_tree_create(const range_tree_ops_t *ops,
|
||||
range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
|
||||
void range_tree_destroy(range_tree_t *rt);
|
||||
@ -316,13 +312,6 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
|
||||
void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
|
||||
range_tree_t *addto);
|
||||
|
||||
void rt_btree_create(range_tree_t *rt, void *arg);
|
||||
void rt_btree_destroy(range_tree_t *rt, void *arg);
|
||||
void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg);
|
||||
void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
|
||||
void rt_btree_vacate(range_tree_t *rt, void *arg);
|
||||
extern const range_tree_ops_t rt_btree_ops;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -220,9 +220,9 @@ typedef struct {
|
||||
|
||||
/*
|
||||
* This controls what conditions are placed on dsl_scan_sync_state():
|
||||
* SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
|
||||
* SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
|
||||
* SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
|
||||
* SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
|
||||
* SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
|
||||
* SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
|
||||
* write out the scn_phys_cached version.
|
||||
* See dsl_scan_sync_state for details.
|
||||
*/
|
||||
@ -284,9 +284,10 @@ struct dsl_scan_io_queue {
|
||||
|
||||
/* trees used for sorting I/Os and extents of I/Os */
|
||||
range_tree_t *q_exts_by_addr;
|
||||
zfs_btree_t q_exts_by_size;
|
||||
zfs_btree_t q_exts_by_size;
|
||||
avl_tree_t q_sios_by_addr;
|
||||
uint64_t q_sio_memused;
|
||||
uint64_t q_last_ext_addr;
|
||||
|
||||
/* members for zio rate limiting */
|
||||
uint64_t q_maxinflight_bytes;
|
||||
@ -644,7 +645,7 @@ dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
|
||||
* Because we can be running in the block sorting algorithm, we do not always
|
||||
* want to write out the record, only when it is "safe" to do so. This safety
|
||||
* condition is achieved by making sure that the sorting queues are empty
|
||||
* (scn_bytes_pending == 0). When this condition is not true, the sync'd state
|
||||
* (scn_queues_pending == 0). When this condition is not true, the sync'd state
|
||||
* is inconsistent with how much actual scanning progress has been made. The
|
||||
* kind of sync to be performed is specified by the sync_type argument. If the
|
||||
* sync is optional, we only sync if the queues are empty. If the sync is
|
||||
@ -667,8 +668,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
|
||||
int i;
|
||||
spa_t *spa = scn->scn_dp->dp_spa;
|
||||
|
||||
ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
|
||||
if (scn->scn_bytes_pending == 0) {
|
||||
ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0);
|
||||
if (scn->scn_queues_pending == 0) {
|
||||
for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
|
||||
vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
|
||||
dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
|
||||
@ -1206,7 +1207,7 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
|
||||
dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
|
||||
DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
|
||||
|
||||
ASSERT0(scn->scn_bytes_pending);
|
||||
ASSERT0(scn->scn_queues_pending);
|
||||
ASSERT(scn->scn_phys.scn_queue_obj != 0);
|
||||
|
||||
VERIFY0(dmu_object_free(dp->dp_meta_objset,
|
||||
@ -1278,11 +1279,12 @@ dsl_scan_should_clear(dsl_scan_t *scn)
|
||||
queue = tvd->vdev_scan_io_queue;
|
||||
if (queue != NULL) {
|
||||
/*
|
||||
* # of extents in exts_by_size = # in exts_by_addr.
|
||||
* # of extents in exts_by_addr = # in exts_by_size.
|
||||
* B-tree efficiency is ~75%, but can be as low as 50%.
|
||||
*/
|
||||
mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
|
||||
3 * sizeof (range_seg_gap_t) + queue->q_sio_memused;
|
||||
((sizeof (range_seg_gap_t) + sizeof (uint64_t)) *
|
||||
3 / 2) + queue->q_sio_memused;
|
||||
}
|
||||
mutex_exit(&tvd->vdev_scan_io_queue_lock);
|
||||
}
|
||||
@ -1290,7 +1292,7 @@ dsl_scan_should_clear(dsl_scan_t *scn)
|
||||
dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
|
||||
|
||||
if (mused == 0)
|
||||
ASSERT0(scn->scn_bytes_pending);
|
||||
ASSERT0(scn->scn_queues_pending);
|
||||
|
||||
/*
|
||||
* If we are above our hard limit, we need to clear out memory.
|
||||
@ -2855,7 +2857,6 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
|
||||
{
|
||||
dsl_scan_t *scn = queue->q_scn;
|
||||
scan_io_t *sio;
|
||||
int64_t bytes_issued = 0;
|
||||
boolean_t suspended = B_FALSE;
|
||||
|
||||
while ((sio = list_head(io_list)) != NULL) {
|
||||
@ -2867,16 +2868,12 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
|
||||
}
|
||||
|
||||
sio2bp(sio, &bp);
|
||||
bytes_issued += SIO_GET_ASIZE(sio);
|
||||
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
|
||||
&sio->sio_zb, queue);
|
||||
(void) list_remove_head(io_list);
|
||||
scan_io_queues_update_zio_stats(queue, &bp);
|
||||
sio_free(sio);
|
||||
}
|
||||
|
||||
atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
|
||||
|
||||
return (suspended);
|
||||
}
|
||||
|
||||
@ -2921,6 +2918,8 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
|
||||
|
||||
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
|
||||
avl_remove(&queue->q_sios_by_addr, sio);
|
||||
if (avl_is_empty(&queue->q_sios_by_addr))
|
||||
atomic_add_64(&queue->q_scn->scn_queues_pending, -1);
|
||||
queue->q_sio_memused -= SIO_GET_MUSED(sio);
|
||||
|
||||
bytes_issued += SIO_GET_ASIZE(sio);
|
||||
@ -2942,12 +2941,13 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
|
||||
range_tree_resize_segment(queue->q_exts_by_addr, rs,
|
||||
SIO_GET_OFFSET(sio), rs_get_end(rs,
|
||||
queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
|
||||
|
||||
queue->q_last_ext_addr = SIO_GET_OFFSET(sio);
|
||||
return (B_TRUE);
|
||||
} else {
|
||||
uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
|
||||
uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
|
||||
range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
|
||||
queue->q_last_ext_addr = -1;
|
||||
return (B_FALSE);
|
||||
}
|
||||
}
|
||||
@ -2972,31 +2972,8 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
|
||||
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
|
||||
ASSERT(scn->scn_is_sorted);
|
||||
|
||||
/* handle tunable overrides */
|
||||
if (scn->scn_checkpointing || scn->scn_clearing) {
|
||||
if (zfs_scan_issue_strategy == 1) {
|
||||
return (range_tree_first(rt));
|
||||
} else if (zfs_scan_issue_strategy == 2) {
|
||||
/*
|
||||
* We need to get the original entry in the by_addr
|
||||
* tree so we can modify it.
|
||||
*/
|
||||
range_seg_t *size_rs =
|
||||
zfs_btree_first(&queue->q_exts_by_size, NULL);
|
||||
if (size_rs == NULL)
|
||||
return (NULL);
|
||||
uint64_t start = rs_get_start(size_rs, rt);
|
||||
uint64_t size = rs_get_end(size_rs, rt) - start;
|
||||
range_seg_t *addr_rs = range_tree_find(rt, start,
|
||||
size);
|
||||
ASSERT3P(addr_rs, !=, NULL);
|
||||
ASSERT3U(rs_get_start(size_rs, rt), ==,
|
||||
rs_get_start(addr_rs, rt));
|
||||
ASSERT3U(rs_get_end(size_rs, rt), ==,
|
||||
rs_get_end(addr_rs, rt));
|
||||
return (addr_rs);
|
||||
}
|
||||
}
|
||||
if (!scn->scn_checkpointing && !scn->scn_clearing)
|
||||
return (NULL);
|
||||
|
||||
/*
|
||||
* During normal clearing, we want to issue our largest segments
|
||||
@ -3007,28 +2984,42 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
|
||||
* so the way we are sorted now is as good as it will ever get.
|
||||
* In this case, we instead switch to issuing extents in LBA order.
|
||||
*/
|
||||
if (scn->scn_checkpointing) {
|
||||
if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) ||
|
||||
zfs_scan_issue_strategy == 1)
|
||||
return (range_tree_first(rt));
|
||||
} else if (scn->scn_clearing) {
|
||||
/*
|
||||
* We need to get the original entry in the by_addr
|
||||
* tree so we can modify it.
|
||||
*/
|
||||
range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
|
||||
NULL);
|
||||
if (size_rs == NULL)
|
||||
return (NULL);
|
||||
uint64_t start = rs_get_start(size_rs, rt);
|
||||
uint64_t size = rs_get_end(size_rs, rt) - start;
|
||||
range_seg_t *addr_rs = range_tree_find(rt, start, size);
|
||||
ASSERT3P(addr_rs, !=, NULL);
|
||||
ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs,
|
||||
rt));
|
||||
ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt));
|
||||
return (addr_rs);
|
||||
} else {
|
||||
return (NULL);
|
||||
|
||||
/*
|
||||
* Try to continue previous extent if it is not completed yet. After
|
||||
* shrink in scan_io_queue_gather() it may no longer be the best, but
|
||||
* otherwise we leave shorter remnant every txg.
|
||||
*/
|
||||
uint64_t start;
|
||||
uint64_t size = 1 << rt->rt_shift;
|
||||
range_seg_t *addr_rs;
|
||||
if (queue->q_last_ext_addr != -1) {
|
||||
start = queue->q_last_ext_addr;
|
||||
addr_rs = range_tree_find(rt, start, size);
|
||||
if (addr_rs != NULL)
|
||||
return (addr_rs);
|
||||
}
|
||||
|
||||
/*
|
||||
* Nothing to continue, so find new best extent.
|
||||
*/
|
||||
uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL);
|
||||
if (v == NULL)
|
||||
return (NULL);
|
||||
queue->q_last_ext_addr = start = *v << rt->rt_shift;
|
||||
|
||||
/*
|
||||
* We need to get the original entry in the by_addr tree so we can
|
||||
* modify it.
|
||||
*/
|
||||
addr_rs = range_tree_find(rt, start, size);
|
||||
ASSERT3P(addr_rs, !=, NULL);
|
||||
ASSERT3U(rs_get_start(addr_rs, rt), ==, start);
|
||||
ASSERT3U(rs_get_end(addr_rs, rt), >, start);
|
||||
return (addr_rs);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -3064,12 +3055,12 @@ scan_io_queues_run_one(void *arg)
|
||||
/* loop until we run out of time or sios */
|
||||
while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
|
||||
uint64_t seg_start = 0, seg_end = 0;
|
||||
boolean_t more_left = B_TRUE;
|
||||
boolean_t more_left;
|
||||
|
||||
ASSERT(list_is_empty(&sio_list));
|
||||
|
||||
/* loop while we still have sios left to process in this rs */
|
||||
while (more_left) {
|
||||
do {
|
||||
scan_io_t *first_sio, *last_sio;
|
||||
|
||||
/*
|
||||
@ -3098,7 +3089,7 @@ scan_io_queues_run_one(void *arg)
|
||||
|
||||
if (suspended)
|
||||
break;
|
||||
}
|
||||
} while (more_left);
|
||||
|
||||
/* update statistics for debugging purposes */
|
||||
scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
|
||||
@ -3139,7 +3130,7 @@ scan_io_queues_run(dsl_scan_t *scn)
|
||||
ASSERT(scn->scn_is_sorted);
|
||||
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
|
||||
|
||||
if (scn->scn_bytes_pending == 0)
|
||||
if (scn->scn_queues_pending == 0)
|
||||
return;
|
||||
|
||||
if (scn->scn_taskq == NULL) {
|
||||
@ -3771,7 +3762,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
spa->spa_name,
|
||||
(longlong_t)tx->tx_txg);
|
||||
}
|
||||
} else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
|
||||
} else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
|
||||
ASSERT(scn->scn_clearing);
|
||||
|
||||
/* need to issue scrubbing IOs from per-vdev queues */
|
||||
@ -3801,7 +3792,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
|
||||
spa->spa_name);
|
||||
ASSERT3U(scn->scn_done_txg, !=, 0);
|
||||
ASSERT0(spa->spa_scrub_inflight);
|
||||
ASSERT0(scn->scn_bytes_pending);
|
||||
ASSERT0(scn->scn_queues_pending);
|
||||
dsl_scan_done(scn, B_TRUE, tx);
|
||||
sync_type = SYNC_MANDATORY;
|
||||
}
|
||||
@ -3892,20 +3883,21 @@ static void
|
||||
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
|
||||
{
|
||||
avl_index_t idx;
|
||||
int64_t asize = SIO_GET_ASIZE(sio);
|
||||
dsl_scan_t *scn = queue->q_scn;
|
||||
|
||||
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
|
||||
|
||||
if (unlikely(avl_is_empty(&queue->q_sios_by_addr)))
|
||||
atomic_add_64(&scn->scn_queues_pending, 1);
|
||||
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
|
||||
/* block is already scheduled for reading */
|
||||
atomic_add_64(&scn->scn_bytes_pending, -asize);
|
||||
sio_free(sio);
|
||||
return;
|
||||
}
|
||||
avl_insert(&queue->q_sios_by_addr, sio, idx);
|
||||
queue->q_sio_memused += SIO_GET_MUSED(sio);
|
||||
range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
|
||||
range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio),
|
||||
SIO_GET_ASIZE(sio));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3918,7 +3910,6 @@ static void
|
||||
scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
|
||||
int zio_flags, const zbookmark_phys_t *zb)
|
||||
{
|
||||
dsl_scan_t *scn = queue->q_scn;
|
||||
scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
|
||||
|
||||
ASSERT0(BP_IS_GANG(bp));
|
||||
@ -3928,13 +3919,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
|
||||
sio->sio_flags = zio_flags;
|
||||
sio->sio_zb = *zb;
|
||||
|
||||
/*
|
||||
* Increment the bytes pending counter now so that we can't
|
||||
* get an integer underflow in case the worker processes the
|
||||
* zio before we get to incrementing this counter.
|
||||
*/
|
||||
atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
|
||||
|
||||
queue->q_last_ext_addr = -1;
|
||||
scan_io_queue_insert_impl(queue, sio);
|
||||
}
|
||||
|
||||
@ -4020,8 +4005,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
|
||||
* Keep track of how much data we've examined so that
|
||||
* zpool(8) status can make useful progress reports.
|
||||
*/
|
||||
scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
|
||||
spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
|
||||
uint64_t asize = DVA_GET_ASIZE(dva);
|
||||
scn->scn_phys.scn_examined += asize;
|
||||
spa->spa_scan_pass_exam += asize;
|
||||
|
||||
/* if it's a resilver, this may not be in the target range */
|
||||
if (!needs_io)
|
||||
@ -4142,33 +4128,88 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
|
||||
* extents that are more completely filled (in a 3:2 ratio) vs just larger.
|
||||
* Note that as an optimization, we replace multiplication and division by
|
||||
* 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
|
||||
*
|
||||
* Since we do not care if one extent is only few percent better than another,
|
||||
* compress the score into 6 bits via binary logarithm AKA highbit64() and
|
||||
* put into otherwise unused due to ashift high bits of offset. This allows
|
||||
* to reduce q_exts_by_size B-tree elements to only 64 bits and compare them
|
||||
* with single operation. Plus it makes scrubs more sequential and reduces
|
||||
* chances that minor extent change move it within the B-tree.
|
||||
*/
|
||||
static int
|
||||
ext_size_compare(const void *x, const void *y)
|
||||
{
|
||||
const range_seg_gap_t *rsa = x, *rsb = y;
|
||||
const uint64_t *a = x, *b = y;
|
||||
|
||||
uint64_t sa = rsa->rs_end - rsa->rs_start;
|
||||
uint64_t sb = rsb->rs_end - rsb->rs_start;
|
||||
uint64_t score_a, score_b;
|
||||
|
||||
score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
|
||||
fill_weight * rsa->rs_fill) >> 7);
|
||||
score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
|
||||
fill_weight * rsb->rs_fill) >> 7);
|
||||
|
||||
if (score_a > score_b)
|
||||
return (-1);
|
||||
if (score_a == score_b) {
|
||||
if (rsa->rs_start < rsb->rs_start)
|
||||
return (-1);
|
||||
if (rsa->rs_start == rsb->rs_start)
|
||||
return (0);
|
||||
return (1);
|
||||
}
|
||||
return (1);
|
||||
return (TREE_CMP(*a, *b));
|
||||
}
|
||||
|
||||
static void
|
||||
ext_size_create(range_tree_t *rt, void *arg)
|
||||
{
|
||||
(void) rt;
|
||||
zfs_btree_t *size_tree = arg;
|
||||
|
||||
zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
|
||||
}
|
||||
|
||||
static void
|
||||
ext_size_destroy(range_tree_t *rt, void *arg)
|
||||
{
|
||||
(void) rt;
|
||||
zfs_btree_t *size_tree = arg;
|
||||
ASSERT0(zfs_btree_numnodes(size_tree));
|
||||
|
||||
zfs_btree_destroy(size_tree);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg)
|
||||
{
|
||||
(void) rt;
|
||||
uint64_t size = rsg->rs_end - rsg->rs_start;
|
||||
uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) *
|
||||
fill_weight * rsg->rs_fill) >> 7);
|
||||
ASSERT3U(rt->rt_shift, >=, 8);
|
||||
return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start);
|
||||
}
|
||||
|
||||
static void
|
||||
ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
zfs_btree_t *size_tree = arg;
|
||||
ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
|
||||
uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
|
||||
zfs_btree_add(size_tree, &v);
|
||||
}
|
||||
|
||||
static void
|
||||
ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
zfs_btree_t *size_tree = arg;
|
||||
ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
|
||||
uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
|
||||
zfs_btree_remove(size_tree, &v);
|
||||
}
|
||||
|
||||
static void
|
||||
ext_size_vacate(range_tree_t *rt, void *arg)
|
||||
{
|
||||
zfs_btree_t *size_tree = arg;
|
||||
zfs_btree_clear(size_tree);
|
||||
zfs_btree_destroy(size_tree);
|
||||
|
||||
ext_size_create(rt, arg);
|
||||
}
|
||||
|
||||
static const range_tree_ops_t ext_size_ops = {
|
||||
.rtop_create = ext_size_create,
|
||||
.rtop_destroy = ext_size_destroy,
|
||||
.rtop_add = ext_size_add,
|
||||
.rtop_remove = ext_size_remove,
|
||||
.rtop_vacate = ext_size_vacate
|
||||
};
|
||||
|
||||
/*
|
||||
* Comparator for the q_sios_by_addr tree. Sorting is simply performed
|
||||
* based on LBA-order (from lowest to highest).
|
||||
@ -4191,9 +4232,10 @@ scan_io_queue_create(vdev_t *vd)
|
||||
q->q_scn = scn;
|
||||
q->q_vd = vd;
|
||||
q->q_sio_memused = 0;
|
||||
q->q_last_ext_addr = -1;
|
||||
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
|
||||
q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP,
|
||||
&q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap);
|
||||
q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP,
|
||||
&q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap);
|
||||
avl_create(&q->q_sios_by_addr, sio_addr_compare,
|
||||
sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
|
||||
|
||||
@ -4211,21 +4253,20 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
|
||||
dsl_scan_t *scn = queue->q_scn;
|
||||
scan_io_t *sio;
|
||||
void *cookie = NULL;
|
||||
int64_t bytes_dequeued = 0;
|
||||
|
||||
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
|
||||
|
||||
if (!avl_is_empty(&queue->q_sios_by_addr))
|
||||
atomic_add_64(&scn->scn_queues_pending, -1);
|
||||
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
|
||||
NULL) {
|
||||
ASSERT(range_tree_contains(queue->q_exts_by_addr,
|
||||
SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
|
||||
bytes_dequeued += SIO_GET_ASIZE(sio);
|
||||
queue->q_sio_memused -= SIO_GET_MUSED(sio);
|
||||
sio_free(sio);
|
||||
}
|
||||
|
||||
ASSERT0(queue->q_sio_memused);
|
||||
atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
|
||||
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
|
||||
range_tree_destroy(queue->q_exts_by_addr);
|
||||
avl_destroy(&queue->q_sios_by_addr);
|
||||
@ -4321,25 +4362,19 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
|
||||
sio_free(srch_sio);
|
||||
|
||||
if (sio != NULL) {
|
||||
int64_t asize = SIO_GET_ASIZE(sio);
|
||||
blkptr_t tmpbp;
|
||||
|
||||
/* Got it while it was cold in the queue */
|
||||
ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
|
||||
ASSERT3U(size, ==, asize);
|
||||
ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
|
||||
avl_remove(&queue->q_sios_by_addr, sio);
|
||||
if (avl_is_empty(&queue->q_sios_by_addr))
|
||||
atomic_add_64(&scn->scn_queues_pending, -1);
|
||||
queue->q_sio_memused -= SIO_GET_MUSED(sio);
|
||||
|
||||
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
|
||||
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
|
||||
|
||||
/*
|
||||
* We only update scn_bytes_pending in the cold path,
|
||||
* otherwise it will already have been accounted for as
|
||||
* part of the zio's execution.
|
||||
*/
|
||||
atomic_add_64(&scn->scn_bytes_pending, -asize);
|
||||
|
||||
/* count the block as though we issued it */
|
||||
sio2bp(sio, &tmpbp);
|
||||
count_block(scn, dp->dp_blkstats, &tmpbp);
|
||||
|
@ -188,10 +188,8 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
|
||||
}
|
||||
|
||||
range_tree_t *
|
||||
range_tree_create_impl(const range_tree_ops_t *ops, range_seg_type_t type,
|
||||
void *arg, uint64_t start, uint64_t shift,
|
||||
int (*zfs_btree_compare) (const void *, const void *),
|
||||
uint64_t gap)
|
||||
range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
|
||||
void *arg, uint64_t start, uint64_t shift, uint64_t gap)
|
||||
{
|
||||
range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
|
||||
|
||||
@ -223,7 +221,6 @@ range_tree_create_impl(const range_tree_ops_t *ops, range_seg_type_t type,
|
||||
rt->rt_type = type;
|
||||
rt->rt_start = start;
|
||||
rt->rt_shift = shift;
|
||||
rt->rt_btree_compare = zfs_btree_compare;
|
||||
|
||||
if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
|
||||
rt->rt_ops->rtop_create(rt, rt->rt_arg);
|
||||
@ -235,7 +232,7 @@ range_tree_t *
|
||||
range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type,
|
||||
void *arg, uint64_t start, uint64_t shift)
|
||||
{
|
||||
return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0));
|
||||
return (range_tree_create_gap(ops, type, arg, start, shift, 0));
|
||||
}
|
||||
|
||||
void
|
||||
@ -741,74 +738,6 @@ range_tree_is_empty(range_tree_t *rt)
|
||||
return (range_tree_space(rt) == 0);
|
||||
}
|
||||
|
||||
void
|
||||
rt_btree_create(range_tree_t *rt, void *arg)
|
||||
{
|
||||
zfs_btree_t *size_tree = arg;
|
||||
|
||||
size_t size;
|
||||
switch (rt->rt_type) {
|
||||
case RANGE_SEG32:
|
||||
size = sizeof (range_seg32_t);
|
||||
break;
|
||||
case RANGE_SEG64:
|
||||
size = sizeof (range_seg64_t);
|
||||
break;
|
||||
case RANGE_SEG_GAP:
|
||||
size = sizeof (range_seg_gap_t);
|
||||
break;
|
||||
default:
|
||||
panic("Invalid range seg type %d", rt->rt_type);
|
||||
}
|
||||
zfs_btree_create(size_tree, rt->rt_btree_compare, size);
|
||||
}
|
||||
|
||||
void
|
||||
rt_btree_destroy(range_tree_t *rt, void *arg)
|
||||
{
|
||||
(void) rt;
|
||||
zfs_btree_t *size_tree = arg;
|
||||
ASSERT0(zfs_btree_numnodes(size_tree));
|
||||
|
||||
zfs_btree_destroy(size_tree);
|
||||
}
|
||||
|
||||
void
|
||||
rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
(void) rt;
|
||||
zfs_btree_t *size_tree = arg;
|
||||
|
||||
zfs_btree_add(size_tree, rs);
|
||||
}
|
||||
|
||||
void
|
||||
rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
|
||||
{
|
||||
(void) rt;
|
||||
zfs_btree_t *size_tree = arg;
|
||||
|
||||
zfs_btree_remove(size_tree, rs);
|
||||
}
|
||||
|
||||
void
|
||||
rt_btree_vacate(range_tree_t *rt, void *arg)
|
||||
{
|
||||
zfs_btree_t *size_tree = arg;
|
||||
zfs_btree_clear(size_tree);
|
||||
zfs_btree_destroy(size_tree);
|
||||
|
||||
rt_btree_create(rt, arg);
|
||||
}
|
||||
|
||||
const range_tree_ops_t rt_btree_ops = {
|
||||
.rtop_create = rt_btree_create,
|
||||
.rtop_destroy = rt_btree_destroy,
|
||||
.rtop_add = rt_btree_add,
|
||||
.rtop_remove = rt_btree_remove,
|
||||
.rtop_vacate = rt_btree_vacate
|
||||
};
|
||||
|
||||
/*
|
||||
* Remove any overlapping ranges between the given segment [start, end)
|
||||
* from removefrom. Add non-overlapping leftovers to addto.
|
||||
|
Loading…
Reference in New Issue
Block a user