mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-04-06 17:49:11 +03:00
Single IO issue for raidz writes with skip sector
In order to reduce contention on the vq_lock, optional skip sectors for RAIDZ writes can be placed into a single IO request. This is done by padding out the linear ABD for a parity column to contain the skip sector and by creating a gang ABD to contain the data and skip sector for data columns. The vdev_raidz_map_alloc() function now calls separate helper functions for reads and writes to allocate the ABDs that will be issued down to the VDEV children. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Reviewed-By: Mark Maybee <mark.maybee@delphix.com> Signed-off-by: Brian Atkinson <batkinson@lanl.gov> Closes #12333
This commit is contained in:
parent
453c63e9b7
commit
345196be18
@ -174,6 +174,114 @@ const zio_vsd_ops_t vdev_raidz_vsd_ops = {
|
|||||||
.vsd_free = vdev_raidz_map_free_vsd,
|
.vsd_free = vdev_raidz_map_free_vsd,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static void
|
||||||
|
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
|
||||||
|
{
|
||||||
|
int c;
|
||||||
|
int nwrapped = 0;
|
||||||
|
uint64_t off = 0;
|
||||||
|
raidz_row_t *rr = rm->rm_row[0];
|
||||||
|
|
||||||
|
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
|
||||||
|
ASSERT3U(rm->rm_nrows, ==, 1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Pad any parity columns with additional space to account for skip
|
||||||
|
* sectors.
|
||||||
|
*/
|
||||||
|
if (rm->rm_skipstart < rr->rr_firstdatacol) {
|
||||||
|
ASSERT0(rm->rm_skipstart);
|
||||||
|
nwrapped = rm->rm_nskip;
|
||||||
|
} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
|
||||||
|
nwrapped =
|
||||||
|
(rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Optional single skip sectors (rc_size == 0) will be handled in
|
||||||
|
* vdev_raidz_io_start_write().
|
||||||
|
*/
|
||||||
|
int skipped = rr->rr_scols - rr->rr_cols;
|
||||||
|
|
||||||
|
/* Allocate buffers for the parity columns */
|
||||||
|
for (c = 0; c < rr->rr_firstdatacol; c++) {
|
||||||
|
raidz_col_t *rc = &rr->rr_col[c];
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Parity columns will pad out a linear ABD to account for
|
||||||
|
* the skip sector. A linear ABD is used here because
|
||||||
|
* parity calculations use the ABD buffer directly to calculate
|
||||||
|
* parity. This avoids doing a memcpy back to the ABD after the
|
||||||
|
* parity has been calculated. By issuing the parity column
|
||||||
|
* with the skip sector we can reduce contention on the child
|
||||||
|
* VDEV queue locks (vq_lock).
|
||||||
|
*/
|
||||||
|
if (c < nwrapped) {
|
||||||
|
rc->rc_abd = abd_alloc_linear(
|
||||||
|
rc->rc_size + (1ULL << ashift), B_FALSE);
|
||||||
|
abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
|
||||||
|
skipped++;
|
||||||
|
} else {
|
||||||
|
rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (off = 0; c < rr->rr_cols; c++) {
|
||||||
|
raidz_col_t *rc = &rr->rr_col[c];
|
||||||
|
abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
|
||||||
|
zio->io_abd, off, rc->rc_size);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generate I/O for skip sectors to improve aggregation
|
||||||
|
* continuity. We will use gang ABD's to reduce contention
|
||||||
|
* on the child VDEV queue locks (vq_lock) by issuing
|
||||||
|
* a single I/O that contains the data and skip sector.
|
||||||
|
*
|
||||||
|
* It is important to make sure that rc_size is not updated
|
||||||
|
* even though we are adding a skip sector to the ABD. When
|
||||||
|
* calculating the parity in vdev_raidz_generate_parity_row()
|
||||||
|
* the rc_size is used to iterate through the ABD's. We can
|
||||||
|
* not have zero'd out skip sectors used for calculating
|
||||||
|
* parity for raidz, because those same sectors are not used
|
||||||
|
* during reconstruction.
|
||||||
|
*/
|
||||||
|
if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
|
||||||
|
rc->rc_abd = abd_alloc_gang();
|
||||||
|
abd_gang_add(rc->rc_abd, abd, B_TRUE);
|
||||||
|
abd_gang_add(rc->rc_abd,
|
||||||
|
abd_get_zeros(1ULL << ashift), B_TRUE);
|
||||||
|
skipped++;
|
||||||
|
} else {
|
||||||
|
rc->rc_abd = abd;
|
||||||
|
}
|
||||||
|
off += rc->rc_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT3U(off, ==, zio->io_size);
|
||||||
|
ASSERT3S(skipped, ==, rm->rm_nskip);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
|
||||||
|
{
|
||||||
|
int c;
|
||||||
|
raidz_row_t *rr = rm->rm_row[0];
|
||||||
|
|
||||||
|
ASSERT3U(rm->rm_nrows, ==, 1);
|
||||||
|
|
||||||
|
/* Allocate buffers for the parity columns */
|
||||||
|
for (c = 0; c < rr->rr_firstdatacol; c++)
|
||||||
|
rr->rr_col[c].rc_abd =
|
||||||
|
abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
|
||||||
|
|
||||||
|
for (uint64_t off = 0; c < rr->rr_cols; c++) {
|
||||||
|
raidz_col_t *rc = &rr->rr_col[c];
|
||||||
|
rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
|
||||||
|
zio->io_abd, off, rc->rc_size);
|
||||||
|
off += rc->rc_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Divides the IO evenly across all child vdevs; usually, dcols is
|
* Divides the IO evenly across all child vdevs; usually, dcols is
|
||||||
* the number of children in the target vdev.
|
* the number of children in the target vdev.
|
||||||
@ -287,17 +395,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
|
|||||||
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
|
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
|
||||||
rm->rm_skipstart = bc;
|
rm->rm_skipstart = bc;
|
||||||
|
|
||||||
for (c = 0; c < rr->rr_firstdatacol; c++)
|
|
||||||
rr->rr_col[c].rc_abd =
|
|
||||||
abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
|
|
||||||
|
|
||||||
for (uint64_t off = 0; c < acols; c++) {
|
|
||||||
raidz_col_t *rc = &rr->rr_col[c];
|
|
||||||
rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
|
|
||||||
zio->io_abd, off, rc->rc_size);
|
|
||||||
off += rc->rc_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If all data stored spans all columns, there's a danger that parity
|
* If all data stored spans all columns, there's a danger that parity
|
||||||
* will always be on the same device and, since parity isn't read
|
* will always be on the same device and, since parity isn't read
|
||||||
@ -333,6 +430,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
|
|||||||
rm->rm_skipstart = 1;
|
rm->rm_skipstart = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (zio->io_type == ZIO_TYPE_WRITE) {
|
||||||
|
vdev_raidz_map_alloc_write(zio, rm, ashift);
|
||||||
|
} else {
|
||||||
|
vdev_raidz_map_alloc_read(zio, rm);
|
||||||
|
}
|
||||||
|
|
||||||
/* init RAIDZ parity ops */
|
/* init RAIDZ parity ops */
|
||||||
rm->rm_ops = vdev_raidz_math_get_ops();
|
rm->rm_ops = vdev_raidz_math_get_ops();
|
||||||
|
|
||||||
@ -1482,6 +1585,7 @@ vdev_raidz_child_done(zio_t *zio)
|
|||||||
{
|
{
|
||||||
raidz_col_t *rc = zio->io_private;
|
raidz_col_t *rc = zio->io_private;
|
||||||
|
|
||||||
|
ASSERT3P(rc->rc_abd, !=, NULL);
|
||||||
rc->rc_error = zio->io_error;
|
rc->rc_error = zio->io_error;
|
||||||
rc->rc_tried = 1;
|
rc->rc_tried = 1;
|
||||||
rc->rc_skipped = 0;
|
rc->rc_skipped = 0;
|
||||||
@ -1525,40 +1629,34 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
|
|||||||
{
|
{
|
||||||
vdev_t *vd = zio->io_vd;
|
vdev_t *vd = zio->io_vd;
|
||||||
raidz_map_t *rm = zio->io_vsd;
|
raidz_map_t *rm = zio->io_vsd;
|
||||||
int c, i;
|
|
||||||
|
|
||||||
vdev_raidz_generate_parity_row(rm, rr);
|
vdev_raidz_generate_parity_row(rm, rr);
|
||||||
|
|
||||||
for (int c = 0; c < rr->rr_cols; c++) {
|
for (int c = 0; c < rr->rr_scols; c++) {
|
||||||
raidz_col_t *rc = &rr->rr_col[c];
|
raidz_col_t *rc = &rr->rr_col[c];
|
||||||
if (rc->rc_size == 0)
|
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
|
||||||
continue;
|
|
||||||
|
|
||||||
/* Verify physical to logical translation */
|
/* Verify physical to logical translation */
|
||||||
vdev_raidz_io_verify(vd, rr, c);
|
vdev_raidz_io_verify(vd, rr, c);
|
||||||
|
|
||||||
zio_nowait(zio_vdev_child_io(zio, NULL,
|
if (rc->rc_size > 0) {
|
||||||
vd->vdev_child[rc->rc_devidx], rc->rc_offset,
|
ASSERT3P(rc->rc_abd, !=, NULL);
|
||||||
rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
|
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
|
||||||
0, vdev_raidz_child_done, rc));
|
rc->rc_offset, rc->rc_abd,
|
||||||
}
|
abd_get_size(rc->rc_abd), zio->io_type,
|
||||||
|
zio->io_priority, 0, vdev_raidz_child_done, rc));
|
||||||
/*
|
} else {
|
||||||
* Generate optional I/Os for skip sectors to improve aggregation
|
/*
|
||||||
* contiguity.
|
* Generate optional write for skip sector to improve
|
||||||
*/
|
* aggregation contiguity.
|
||||||
for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
|
*/
|
||||||
ASSERT(c <= rr->rr_scols);
|
ASSERT3P(rc->rc_abd, ==, NULL);
|
||||||
if (c == rr->rr_scols)
|
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
|
||||||
c = 0;
|
rc->rc_offset, NULL, 1ULL << ashift,
|
||||||
|
zio->io_type, zio->io_priority,
|
||||||
raidz_col_t *rc = &rr->rr_col[c];
|
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
|
||||||
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
|
NULL));
|
||||||
|
}
|
||||||
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
|
|
||||||
rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
|
|
||||||
zio->io_type, zio->io_priority,
|
|
||||||
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user