Remove ARC/ZIO physdone callbacks.

Those callbacks were introduced many years ago as part of a bigger
patch to smoothen the write throttling within a txg. They allow to
account completion of individual physical writes within a logical
one, improving cases when some of physical writes complete much
sooner than others, gradually opening the write throttle.

Few years after that ZFS got allocation throttling, working on a
level of logical writes and limiting number of writes queued to
vdevs at any point, and so limiting latency distribution between
the physical writes and especially writes of multiple copies.
The addition of scheduling deadline I proposed in #14925 should
further reduce the latency distribution.  Grown memory sizes over
the past 10 years should also reduce importance of the smoothing.

While the use of physdone callback may still in theory provide
some smoother throttling, there are cases where we simply can not
afford it.  Since dirty data accounting is protected by pool-wide
lock, in case of 6-wide RAIDZ, for example, it requires us to take
it 8 times per logical block write, creating huge lock contention.

My tests of this patch show radical reduction of the lock spinning
time on workloads when smaller blocks are written to RAIDZ pools,
when each of the disks receives 8-16KB chunks, but the total rate
reaching 100K+ blocks per second.  Same time attempts to measure
any write time fluctuations didn't show anything noticeable.

While there, remove also io_child_count/io_parent_count counters.
They are used only for couple assertions that can be avoided.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #14948
This commit is contained in:
Alexander Motin 2023-06-15 13:49:03 -04:00 committed by GitHub
parent e32e326c5b
commit ccec7fbe1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 26 additions and 143 deletions

View File

@ -304,9 +304,8 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
arc_write_done_func_t *physdone, arc_write_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb);
arc_write_done_func_t *done, void *priv, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb);
arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
void arc_remove_prune_callback(arc_prune_t *p);

View File

@ -123,7 +123,6 @@ struct arc_write_callback {
void *awcb_private;
arc_write_done_func_t *awcb_ready;
arc_write_done_func_t *awcb_children_ready;
arc_write_done_func_t *awcb_physdone;
arc_write_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};

View File

@ -460,7 +460,6 @@ struct zio {
/* Callback info */
zio_done_func_t *io_ready;
zio_done_func_t *io_children_ready;
zio_done_func_t *io_physdone;
zio_done_func_t *io_done;
void *io_private;
int64_t io_prev_space_delta; /* DMU private */
@ -503,9 +502,6 @@ struct zio {
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
uint64_t io_child_count;
uint64_t io_phys_children;
uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
@ -553,9 +549,8 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *priv, zio_priority_t priority, zio_flag_t flags,
const zbookmark_phys_t *zb);
zio_done_func_t *done, void *priv, zio_priority_t priority,
zio_flag_t flags, const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,

View File

@ -6675,18 +6675,6 @@ arc_write_children_ready(zio_t *zio)
callback->awcb_children_ready(zio, buf, callback->awcb_private);
}
/*
* The SPA calls this callback for each physical write that happens on behalf
* of a logical write. See the comment in dbuf_write_physdone() for details.
*/
static void
arc_write_physdone(zio_t *zio)
{
arc_write_callback_t *cb = zio->io_private;
if (cb->awcb_physdone != NULL)
cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
}
static void
arc_write_done(zio_t *zio)
{
@ -6776,9 +6764,9 @@ zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
const zio_prop_t *zp, arc_write_done_func_t *ready,
arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
arc_write_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb)
arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
@ -6825,7 +6813,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_children_ready = children_ready;
callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
@ -6862,8 +6849,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
arc_write_physdone, arc_write_done, callback,
priority, zio_flags, zb);
arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}

View File

@ -4369,22 +4369,6 @@ dbuf_lightweight_ready(zio_t *zio)
rw_exit(&parent_db->db_rwlock);
}
static void
dbuf_lightweight_physdone(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
ASSERT3U(dr->dr_txg, ==, zio->io_txg);
/*
* The callback will be called io_phys_children times. Retire one
* portion of our dirty space each time we are called. Any rounding
* error will be cleaned up by dbuf_lightweight_done().
*/
int delta = dr->dr_accounted / zio->io_phys_children;
dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
static void
dbuf_lightweight_done(zio_t *zio)
{
@ -4403,16 +4387,8 @@ dbuf_lightweight_done(zio_t *zio)
dsl_dataset_block_born(ds, zio->io_bp, tx);
}
/*
* See comment in dbuf_write_done().
*/
if (zio->io_phys_children == 0) {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted, zio->io_txg);
} else {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted % zio->io_phys_children, zio->io_txg);
}
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
zio->io_txg);
abd_free(dr->dt.dll.dr_abd);
kmem_free(dr, sizeof (*dr));
@ -4446,8 +4422,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
ZIO_PRIORITY_ASYNC_WRITE,
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
zio_nowait(dr->dr_zio);
@ -4789,37 +4764,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
DB_DNODE_EXIT(db);
}
/*
* The SPA will call this callback several times for each zio - once
* for every physical child i/o (zio->io_phys_children times). This
* allows the DMU to monitor the progress of each logical i/o. For example,
* there may be 2 copies of an indirect block, or many fragments of a RAID-Z
* block. There may be a long delay before all copies/fragments are completed,
* so this callback allows us to retire dirty space gradually, as the physical
* i/os complete.
*/
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
(void) buf;
dmu_buf_impl_t *db = arg;
objset_t *os = db->db_objset;
dsl_pool_t *dp = dmu_objset_pool(os);
dbuf_dirty_record_t *dr;
int delta = 0;
dr = db->db_data_pending;
ASSERT3U(dr->dr_txg, ==, zio->io_txg);
/*
* The callback will be called io_phys_children times. Retire one
* portion of our dirty space each time we are called. Any rounding
* error will be cleaned up by dbuf_write_done().
*/
delta = dr->dr_accounted / zio->io_phys_children;
dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
@ -4894,27 +4838,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
/*
* If we didn't do a physical write in this ZIO and we
* still ended up here, it means that the space of the
* dbuf that we just released (and undirtied) above hasn't
* been marked as undirtied in the pool's accounting.
*
* Thus, we undirty that space in the pool's view of the
* world here. For physical writes this type of update
* happens in dbuf_write_physdone().
*
* If we did a physical write, cleanup any rounding errors
* that came up due to writing multiple copies of a block
* on disk [see dbuf_write_physdone()].
*/
if (zio->io_phys_children == 0) {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted, zio->io_txg);
} else {
dsl_pool_undirty_space(dmu_objset_pool(os),
dr->dr_accounted % zio->io_phys_children, zio->io_txg);
}
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
zio->io_txg);
kmem_free(dr, sizeof (dbuf_dirty_record_t));
}
@ -5162,7 +5087,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
dbuf_write_override_ready, NULL, NULL,
dbuf_write_override_ready, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
@ -5176,7 +5101,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
dbuf_write_nofill_ready, NULL, NULL,
dbuf_write_nofill_ready, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
@ -5195,9 +5120,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = arc_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
children_ready_cb, dbuf_write_physdone,
dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED, &zb);
children_ready_cb, dbuf_write_done, db,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}

View File

@ -1698,7 +1698,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
@ -1864,7 +1864,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
&zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);

View File

@ -1698,7 +1698,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
zio = arc_write(pio, os->os_spa, tx->tx_txg,
blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
&zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*

View File

@ -650,9 +650,6 @@ zio_add_child(zio_t *pio, zio_t *cio)
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
pio->io_child_count++;
cio->io_parent_count++;
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
}
@ -669,9 +666,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
pio->io_child_count--;
cio->io_parent_count--;
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
kmem_cache_free(zio_link_cache, zl);
@ -1162,9 +1156,8 @@ zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, zio_flag_t flags,
const zbookmark_phys_t *zb)
zio_done_func_t *done, void *private, zio_priority_t priority,
zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
@ -1184,7 +1177,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_ready = ready;
zio->io_children_ready = children_ready;
zio->io_physdone = physdone;
zio->io_prop = *zp;
/*
@ -1517,16 +1509,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
zio->io_physdone = pio->io_physdone;
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
zio->io_logical->io_phys_children++;
return (zio);
}
@ -2711,7 +2698,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
blkptr_t *bp = zio->io_bp;
ASSERT(gio == zio_unique_parent(zio));
ASSERT(zio->io_child_count == 0);
ASSERT(list_is_empty(&zio->io_child_list));
if (zio->io_error)
return;
@ -2969,7 +2956,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
has_data ? abd_get_offset(pio->io_abd, pio->io_size -
resid) : NULL, lsize, lsize, &zp,
zio_write_gang_member_ready, NULL, NULL,
zio_write_gang_member_ready, NULL,
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
@ -3431,7 +3418,7 @@ zio_ddt_write(zio_t *zio)
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@ -4134,13 +4121,6 @@ zio_vdev_io_assess(zio_t *zio)
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
zio->io_physdone != NULL) {
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
zio->io_physdone(zio->io_logical);
}
return (zio);
}
@ -4890,7 +4870,7 @@ zio_done(zio_t *zio)
return (NULL);
}
ASSERT(zio->io_child_count == 0);
ASSERT(list_is_empty(&zio->io_child_list));
ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));