mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 18:40:43 +03:00
Always validate checksums for Direct I/O reads
This fixes an oversight in the Direct I/O PR. There is nothing that stops a process from manipulating the contents of a buffer for a Direct I/O read while the I/O is in flight. This can lead to checksum verify failures. However, the disk contents are still correct, and this would lead to false reporting of checksum validation failures. To remedy this, all Direct I/O reads that have a checksum verification failure are treated as suspicious. In the event a checksum validation failure occurs for a Direct I/O read, then the I/O request will be reissued through the ARC. This allows for actual validation to happen and removes any possibility of the buffer being manipulated after the I/O has been issued. Just as with Direct I/O write checksum validation failures, Direct I/O read checksum validation failures are reported through zpool status -d in the DIO column. Also the zevent has been updated to have both: 1. dio_verify_wr -> Checksum verification failure for writes 2. dio_verify_rd -> Checksum verification failure for reads. This allows for determining what I/O operation was the culprit for the checksum verification failure. All DIO errors are reported only on the top-level VDEV. Even though FreeBSD can write protect pages (stable pages) it still has the same issue as Linux with Direct I/O reads. This commit updates the following: 1. Propagates checksum failures for reads all the way up to the top-level VDEV. 2. Reports errors through zpool status -d as DIO. 3. Has two zevents for checksum verify errors with Direct I/O. One for read and one for write. 4. Updates FreeBSD ABD code to also check for ABD_FLAG_FROM_PAGES and handle ABD buffer contents validation the same as Linux. 5. Updated manipulate_user_buffer.c to also manipulate a buffer while a Direct I/O read is taking place. 6. Adds a new ZTS test case dio_read_verify that stress tests the new code. 7. Updated man pages. 8. 
Added an IMPLY statement to zio_checksum_verify() to make sure that Direct I/O reads are not issued as speculative. 9. Removed self healing through mirror, raidz, and dRAID VDEVs for Direct I/O reads. This issue was first observed when installing a Windows 11 VM on a ZFS dataset with the dataset property direct set to always. The zpool devices would report checksum failures, but running a subsequent zpool scrub would not repair any data and report no errors. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Brian Atkinson <batkinson@lanl.gov> Closes #16598
This commit is contained in:
+39
-5
@@ -433,7 +433,7 @@ const zio_vsd_ops_t vdev_raidz_vsd_ops = {
|
||||
};
|
||||
|
||||
raidz_row_t *
|
||||
vdev_raidz_row_alloc(int cols)
|
||||
vdev_raidz_row_alloc(int cols, zio_t *zio)
|
||||
{
|
||||
raidz_row_t *rr =
|
||||
kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
|
||||
@@ -445,7 +445,17 @@ vdev_raidz_row_alloc(int cols)
|
||||
raidz_col_t *rc = &rr->rr_col[c];
|
||||
rc->rc_shadow_devidx = INT_MAX;
|
||||
rc->rc_shadow_offset = UINT64_MAX;
|
||||
rc->rc_allow_repair = 1;
|
||||
/*
|
||||
* We can not allow self healing to take place for Direct I/O
|
||||
* reads. There is nothing that stops the buffer contents from
|
||||
* being manipulated while the I/O is in flight. It is possible
|
||||
* that the checksum could be verified on the buffer and then
|
||||
* the contents of that buffer are manipulated afterwards. This
|
||||
* could lead to bad data being written out during self
|
||||
* healing.
|
||||
*/
|
||||
if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
|
||||
rc->rc_allow_repair = 1;
|
||||
}
|
||||
return (rr);
|
||||
}
|
||||
@@ -619,7 +629,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
|
||||
}
|
||||
|
||||
ASSERT3U(acols, <=, scols);
|
||||
rr = vdev_raidz_row_alloc(scols);
|
||||
rr = vdev_raidz_row_alloc(scols, zio);
|
||||
rm->rm_row[0] = rr;
|
||||
rr->rr_cols = acols;
|
||||
rr->rr_bigcols = bc;
|
||||
@@ -765,7 +775,7 @@ vdev_raidz_map_alloc_expanded(zio_t *zio,
|
||||
|
||||
for (uint64_t row = 0; row < rows; row++) {
|
||||
boolean_t row_use_scratch = B_FALSE;
|
||||
raidz_row_t *rr = vdev_raidz_row_alloc(cols);
|
||||
raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
|
||||
rm->rm_row[row] = rr;
|
||||
|
||||
/* The starting RAIDZ (parent) vdev sector of the row. */
|
||||
@@ -2633,6 +2643,20 @@ raidz_checksum_verify(zio_t *zio)
|
||||
raidz_map_t *rm = zio->io_vsd;
|
||||
|
||||
int ret = zio_checksum_error(zio, &zbc);
|
||||
/*
|
||||
* Any Direct I/O read that has a checksum error must be treated as
|
||||
* suspicious as the contents of the buffer could be getting
|
||||
* manipulated while the I/O is taking place. The checksum verify error
|
||||
* will be reported to the top-level RAIDZ VDEV.
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
|
||||
zio->io_error = ret;
|
||||
zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
|
||||
zio_dio_chksum_verify_error_report(zio);
|
||||
zio_checksum_verified(zio);
|
||||
return (0);
|
||||
}
|
||||
|
||||
if (ret != 0 && zbc.zbc_injected != 0)
|
||||
rm->rm_ecksuminjected = 1;
|
||||
|
||||
@@ -2776,6 +2800,11 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
|
||||
(rc->rc_error == 0 || rc->rc_size == 0)) {
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* We do not allow self healing for Direct I/O reads.
|
||||
* See comment in vdev_raidz_row_alloc().
|
||||
*/
|
||||
ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
|
||||
|
||||
zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
|
||||
"offset=%llx",
|
||||
@@ -2979,6 +3008,8 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
|
||||
|
||||
/* Check for success */
|
||||
if (raidz_checksum_verify(zio) == 0) {
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
|
||||
return (0);
|
||||
|
||||
/* Reconstruction succeeded - report errors */
|
||||
for (int i = 0; i < rm->rm_nrows; i++) {
|
||||
@@ -3379,7 +3410,6 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio)
|
||||
zio_bad_cksum_t zbc;
|
||||
zbc.zbc_has_cksum = 0;
|
||||
zbc.zbc_injected = rm->rm_ecksuminjected;
|
||||
|
||||
mutex_enter(&cvd->vdev_stat_lock);
|
||||
cvd->vdev_stat.vs_checksum_errors++;
|
||||
mutex_exit(&cvd->vdev_stat_lock);
|
||||
@@ -3444,6 +3474,9 @@ vdev_raidz_io_done(zio_t *zio)
|
||||
}
|
||||
|
||||
if (raidz_checksum_verify(zio) == 0) {
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
|
||||
goto done;
|
||||
|
||||
for (int i = 0; i < rm->rm_nrows; i++) {
|
||||
raidz_row_t *rr = rm->rm_row[i];
|
||||
vdev_raidz_io_done_verified(zio, rr);
|
||||
@@ -3538,6 +3571,7 @@ vdev_raidz_io_done(zio_t *zio)
|
||||
}
|
||||
}
|
||||
}
|
||||
done:
|
||||
if (rm->rm_lr != NULL) {
|
||||
zfs_rangelock_exit(rm->rm_lr);
|
||||
rm->rm_lr = NULL;
|
||||
|
||||
Reference in New Issue
Block a user