mirror of https://git.proxmox.com/git/mirror_zfs.git
synced 2024-11-18 02:20:59 +03:00
Improve scrub maxinflight_bytes math.
Previously, ZFS scaled maxinflight_bytes based on the total number of
disks in the pool. A 3-wide mirror was receiving a queue depth of 3
disks, which it should not, since it reads from all the disks inside.
For wide raidz the situation was slightly better, but a 3-wide raidz1
still received a depth of 3 disks instead of 2.

The new code counts only unique data disks, i.e. 1 disk for mirrors
and the non-parity disks for raidz/draid. For draid the math is still
imperfect, since vdev_get_nparity() returns the number of parity disks
per group, not per vdev, but it is still better than it was.

This should slightly reduce the scrub impact on payload I/O for some
pool topologies by avoiding excessive queuing.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12046
parent ec3b25825e
commit 09c0a8fd1a
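
As a concrete illustration of the new accounting (a minimal standalone
sketch with a hypothetical struct and values, not the kernel code; it
assumes, per the message above, that vdev_get_ndisks() reports 1 for
mirrors and the child count for raidz):

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical stand-in for a top-level vdev's disk accounting. */
    typedef struct toplevel {
        const char *name;
        uint64_t ndisks;    /* as vdev_get_ndisks() would report */
        uint64_t nparity;   /* as vdev_get_nparity() would report */
    } toplevel_t;

    int
    main(void)
    {
        toplevel_t pool[] = {
            { "3-wide mirror", 1, 0 },  /* counts as 1 data disk */
            { "3-wide raidz1", 3, 1 },  /* 2 data disks (was 3)  */
        };
        uint64_t i, disks = 0;

        /* Same counting idea as the new dsl_scan_count_data_disks(). */
        for (i = 0; i < sizeof (pool) / sizeof (pool[0]); i++)
            disks += pool[i].ndisks - pool[i].nparity;

        printf("unique data disks: %llu\n", (unsigned long long)disks);
        return (0);
    }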
@@ -3326,7 +3326,7 @@ Default value: \fB0\fR.
 Maximum amount of data that can be concurrently issued at once for scrubs and
 resilvers per leaf device, given in bytes.
 .sp
-Default value: \fB41943040\fR.
+Default value: \fB4194304\fR.
 .RE
 
 .sp
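
The old documented value looks like a typo with an extra zero:
4194304 bytes is 4 MiB, matching the assumed compiled-in default of
zfs_scan_vdev_limit, while 41943040 would have been 40 MiB. A trivial
check of the arithmetic:

    #include <assert.h>

    int
    main(void)
    {
        /* 4194304 B == 4 MiB; the old 41943040 would be 40 MiB. */
        assert(4194304 == (4ULL << 20));
        return (0);
    }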
@@ -126,7 +126,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_leaves(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
 
 extern int zfs_vdev_async_write_active_min_dirty_percent;
 
@@ -451,7 +451,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 	 * phase are done per top-level vdev and are handled separately.
 	 */
 	scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-	    dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
+	    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
 
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
@@ -2759,22 +2759,16 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 }
 
 static uint64_t
-dsl_scan_count_leaves(vdev_t *vd)
+dsl_scan_count_data_disks(vdev_t *rvd)
 {
 	uint64_t i, leaves = 0;
 
-	/* we only count leaves that belong to the main pool and are readable */
-	if (vd->vdev_islog || vd->vdev_isspare ||
-	    vd->vdev_isl2cache || !vdev_readable(vd))
-		return (0);
-
-	if (vd->vdev_ops->vdev_op_leaf)
-		return (1);
-
-	for (i = 0; i < vd->vdev_children; i++) {
-		leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
-	}
-
+	for (i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+		if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
+			continue;
+		leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd);
+	}
 	return (leaves);
 }
 
@@ -3017,8 +3011,6 @@ scan_io_queues_run_one(void *arg)
 	range_seg_t *rs = NULL;
 	scan_io_t *sio = NULL;
 	list_t sio_list;
-	uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
-	uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
 
 	ASSERT(queue->q_scn->scn_is_sorted);
 
@@ -3026,9 +3018,9 @@ scan_io_queues_run_one(void *arg)
 	    offsetof(scan_io_t, sio_nodes.sio_list_node));
 	mutex_enter(q_lock);
 
-	/* calculate maximum in-flight bytes for this txg (min 1MB) */
-	queue->q_maxinflight_bytes =
-	    MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+	/* Calculate maximum in-flight bytes for this vdev. */
+	queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
+	    (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd)));
 
 	/* reset per-queue scan statistics for this txg */
 	queue->q_total_seg_size_this_txg = 0;
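
For a concrete feel of the new per-vdev limit (a sketch assuming
zfs_scan_vdev_limit keeps its 4 MiB default; not the kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define ZFS_SCAN_VDEV_LIMIT (4ULL << 20)    /* assumed 4 MiB default */

    /* Same shape as the new formula: MAX(1, limit * (ndisks - nparity)). */
    static uint64_t
    q_maxinflight(uint64_t ndisks, uint64_t nparity)
    {
        uint64_t v = ZFS_SCAN_VDEV_LIMIT * (ndisks - nparity);
        return (v > 1 ? v : 1);
    }

    int
    main(void)
    {
        /* 3-wide mirror: ndisks == 1, nparity == 0 -> 4 MiB */
        printf("mirror: %llu\n", (unsigned long long)q_maxinflight(1, 0));
        /* 6-wide raidz2: 6 - 2 = 4 data disks -> 16 MiB */
        printf("raidz2: %llu\n", (unsigned long long)q_maxinflight(6, 2));
        return (0);
    }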
@@ -3665,16 +3657,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		/* Need to scan metadata for more blocks to scrub */
 		dsl_scan_phys_t *scnp = &scn->scn_phys;
 		taskqid_t prefetch_tqid;
-		uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
-		uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
 
 		/*
 		 * Recalculate the max number of in-flight bytes for pool-wide
 		 * scanning operations (minimum 1MB). Limits for the issuing
 		 * phase are done per top-level vdev and are handled separately.
 		 */
-		scn->scn_maxinflight_bytes =
-		    MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+		scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
+		    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
 
 		if (scnp->scn_ddt_bookmark.ddb_class <=
 		    scnp->scn_ddt_class_max) {
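
The pool-wide limit above keeps its 1 MiB floor (the per-vdev limit in
scan_io_queues_run_one() floors at 1 byte instead). With the assumed
4 MiB zfs_scan_vdev_limit, a hypothetical pool of four 6-wide raidz2
vdevs has 4 * (6 - 2) = 16 data disks, so:

    #include <stdio.h>
    #include <stdint.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t zfs_scan_vdev_limit = 4ULL << 20;  /* assumed default */
        uint64_t data_disks = 4 * (6 - 2);  /* four 6-wide raidz2 vdevs */

        /* Same shape as the new scn_maxinflight_bytes computation. */
        printf("%llu bytes\n", (unsigned long long)MAX(
            zfs_scan_vdev_limit * data_disks, 1ULL << 20)); /* 64 MiB */
        return (0);
    }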
@@ -4050,9 +4040,8 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
 	size_t size = BP_GET_PSIZE(bp);
 	abd_t *data = abd_alloc_for_io(size, B_FALSE);
 
-	ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
-
 	if (queue == NULL) {
+		ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
@@ -4061,6 +4050,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
 	} else {
 		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 
+		ASSERT3U(queue->q_maxinflight_bytes, >, 0);
 		mutex_enter(q_lock);
 		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
 			cv_wait(&queue->q_zio_cv, q_lock);
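
The wait loops above are the classic condition-variable throttle. A
minimal userspace sketch of the same pattern (hypothetical names; the
kernel uses mutex_enter()/cv_wait() rather than pthreads):

    #include <pthread.h>
    #include <stdint.h>

    typedef struct throttle {
        pthread_mutex_t t_lock;
        pthread_cond_t t_cv;
        uint64_t t_inflight;    /* bytes currently issued */
        uint64_t t_max;         /* e.g. q_maxinflight_bytes */
    } throttle_t;

    /* Block until there is room, then account for the new I/O. */
    static void
    throttle_enter(throttle_t *t, uint64_t bytes)
    {
        pthread_mutex_lock(&t->t_lock);
        while (t->t_inflight >= t->t_max)
            pthread_cond_wait(&t->t_cv, &t->t_lock);
        t->t_inflight += bytes;
        pthread_mutex_unlock(&t->t_lock);
    }

    /* Called on I/O completion; wakes any waiting submitters. */
    static void
    throttle_exit(throttle_t *t, uint64_t bytes)
    {
        pthread_mutex_lock(&t->t_lock);
        t->t_inflight -= bytes;
        pthread_cond_broadcast(&t->t_cv);
        pthread_mutex_unlock(&t->t_lock);
    }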