mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 03:08:51 +03:00
Always validate checksums for Direct I/O reads
This fixes an oversight in the Direct I/O PR. There is nothing that stops a process from manipulating the contents of a buffer for a Direct I/O read while the I/O is in flight. This can lead checksum verify failures. However, the disk contents are still correct, and this would lead to false reporting of checksum validation failures. To remedy this, all Direct I/O reads that have a checksum verification failure are treated as suspicious. In the event a checksum validation failure occurs for a Direct I/O read, then the I/O request will be reissued though the ARC. This allows for actual validation to happen and removes any possibility of the buffer being manipulated after the I/O has been issued. Just as with Direct I/O write checksum validation failures, Direct I/O read checksum validation failures are reported though zpool status -d in the DIO column. Also the zevent has been updated to have both: 1. dio_verify_wr -> Checksum verification failure for writes 2. dio_verify_rd -> Checksum verification failure for reads. This allows for determining what I/O operation was the culprit for the checksum verification failure. All DIO errors are reported only on the top-level VDEV. Even though FreeBSD can write protect pages (stable pages) it still has the same issue as Linux with Direct I/O reads. This commit updates the following: 1. Propogates checksum failures for reads all the way up to the top-level VDEV. 2. Reports errors through zpool status -d as DIO. 3. Has two zevents for checksum verify errors with Direct I/O. One for read and one for write. 4. Updates FreeBSD ABD code to also check for ABD_FLAG_FROM_PAGES and handle ABD buffer contents validation the same as Linux. 5. Updated manipulate_user_buffer.c to also manipulate a buffer while a Direct I/O read is taking place. 6. Adds a new ZTS test case dio_read_verify that stress tests the new code. 7. Updated man pages. 8. Added an IMPLY statement to zio_checksum_verify() to make sure that Direct I/O reads are not issued as speculative. 9. Removed self healing through mirror, raidz, and dRAID VDEVs for Direct I/O reads. This issue was first observed when installing a Windows 11 VM on a ZFS dataset with the dataset property direct set to always. The zpool devices would report checksum failures, but running a subsequent zpool scrub would not repair any data and report no errors. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Brian Atkinson <batkinson@lanl.gov> Closes #16598
This commit is contained in:
+85
-35
@@ -804,11 +804,11 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
|
||||
pio->io_reexecute |= zio->io_reexecute;
|
||||
ASSERT3U(*countp, >, 0);
|
||||
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
|
||||
ASSERT3U(*errorp, ==, EIO);
|
||||
ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
|
||||
/*
|
||||
* Propogate the Direct I/O checksum verify failure to the parent.
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
|
||||
pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
|
||||
}
|
||||
|
||||
(*countp)--;
|
||||
|
||||
@@ -1573,6 +1573,14 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
|
||||
*/
|
||||
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
|
||||
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
||||
/*
|
||||
* We never allow the mirror VDEV to attempt reading from any
|
||||
* additional data copies after the first Direct I/O checksum
|
||||
* verify failure. This is to avoid bad data being written out
|
||||
* through the mirror during self healing. See comment in
|
||||
* vdev_mirror_io_done() for more details.
|
||||
*/
|
||||
ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
|
||||
} else if (type == ZIO_TYPE_WRITE &&
|
||||
pio->io_prop.zp_direct_write == B_TRUE) {
|
||||
/*
|
||||
@@ -4555,18 +4563,18 @@ zio_vdev_io_assess(zio_t *zio)
|
||||
}
|
||||
|
||||
/*
|
||||
* If a Direct I/O write checksum verify error has occurred then this
|
||||
* I/O should not attempt to be issued again. Instead the EIO will
|
||||
* be returned.
|
||||
* If a Direct I/O operation has a checksum verify error then this I/O
|
||||
* should not attempt to be issued again.
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
|
||||
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
|
||||
ASSERT3U(zio->io_error, ==, EIO);
|
||||
if (zio->io_type == ZIO_TYPE_WRITE) {
|
||||
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
|
||||
ASSERT3U(zio->io_error, ==, EIO);
|
||||
}
|
||||
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
||||
return (zio);
|
||||
}
|
||||
|
||||
|
||||
if (zio_injection_enabled && zio->io_error == 0)
|
||||
zio->io_error = zio_handle_fault_injection(zio, EIO);
|
||||
|
||||
@@ -4864,16 +4872,40 @@ zio_checksum_verify(zio_t *zio)
|
||||
ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
|
||||
}
|
||||
|
||||
ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
|
||||
IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ,
|
||||
!(zio->io_flags & ZIO_FLAG_SPECULATIVE));
|
||||
|
||||
if ((error = zio_checksum_error(zio, &info)) != 0) {
|
||||
zio->io_error = error;
|
||||
if (error == ECKSUM &&
|
||||
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
|
||||
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
||||
zio->io_vd->vdev_stat.vs_checksum_errors++;
|
||||
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
||||
(void) zfs_ereport_start_checksum(zio->io_spa,
|
||||
zio->io_vd, &zio->io_bookmark, zio,
|
||||
zio->io_offset, zio->io_size, &info);
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_READ) {
|
||||
zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
|
||||
zio_t *pio = zio_unique_parent(zio);
|
||||
/*
|
||||
* Any Direct I/O read that has a checksum
|
||||
* error must be treated as suspicous as the
|
||||
* contents of the buffer could be getting
|
||||
* manipulated while the I/O is taking place.
|
||||
*
|
||||
* The checksum verify error will only be
|
||||
* reported here for disk and file VDEV's and
|
||||
* will be reported on those that the failure
|
||||
* occurred on. Other types of VDEV's report the
|
||||
* verify failure in their own code paths.
|
||||
*/
|
||||
if (pio->io_child_type == ZIO_CHILD_LOGICAL) {
|
||||
zio_dio_chksum_verify_error_report(zio);
|
||||
}
|
||||
} else {
|
||||
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
||||
zio->io_vd->vdev_stat.vs_checksum_errors++;
|
||||
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
||||
(void) zfs_ereport_start_checksum(zio->io_spa,
|
||||
zio->io_vd, &zio->io_bookmark, zio,
|
||||
zio->io_offset, zio->io_size, &info);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4899,22 +4931,8 @@ zio_dio_checksum_verify(zio_t *zio)
|
||||
if ((error = zio_checksum_error(zio, NULL)) != 0) {
|
||||
zio->io_error = error;
|
||||
if (error == ECKSUM) {
|
||||
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
||||
zio->io_vd->vdev_stat.vs_dio_verify_errors++;
|
||||
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
||||
zio->io_error = SET_ERROR(EIO);
|
||||
zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
|
||||
|
||||
/*
|
||||
* The EIO error must be propagated up to the logical
|
||||
* parent ZIO in zio_notify_parent() so it can be
|
||||
* returned to dmu_write_abd().
|
||||
*/
|
||||
zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE;
|
||||
|
||||
(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY,
|
||||
zio->io_spa, zio->io_vd, &zio->io_bookmark,
|
||||
zio, 0);
|
||||
zio_dio_chksum_verify_error_report(zio);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4932,6 +4950,39 @@ zio_checksum_verified(zio_t *zio)
|
||||
zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
||||
}
|
||||
|
||||
/*
|
||||
* Report Direct I/O checksum verify error and create ZED event.
|
||||
*/
|
||||
void
|
||||
zio_dio_chksum_verify_error_report(zio_t *zio)
|
||||
{
|
||||
ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
|
||||
|
||||
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
|
||||
return;
|
||||
|
||||
mutex_enter(&zio->io_vd->vdev_stat_lock);
|
||||
zio->io_vd->vdev_stat.vs_dio_verify_errors++;
|
||||
mutex_exit(&zio->io_vd->vdev_stat_lock);
|
||||
if (zio->io_type == ZIO_TYPE_WRITE) {
|
||||
/*
|
||||
* Convert checksum error for writes into EIO.
|
||||
*/
|
||||
zio->io_error = SET_ERROR(EIO);
|
||||
/*
|
||||
* Report dio_verify_wr ZED event.
|
||||
*/
|
||||
(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_WR,
|
||||
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
|
||||
} else {
|
||||
/*
|
||||
* Report dio_verify_rd ZED event.
|
||||
*/
|
||||
(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_RD,
|
||||
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ==========================================================================
|
||||
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
|
||||
@@ -5343,10 +5394,9 @@ zio_done(zio_t *zio)
|
||||
|
||||
if (zio->io_reexecute) {
|
||||
/*
|
||||
* A Direct I/O write that has a checksum verify error should
|
||||
* not attempt to reexecute. Instead, EAGAIN should just be
|
||||
* propagated back up so the write can be attempt to be issued
|
||||
* through the ARC.
|
||||
* A Direct I/O operation that has a checksum verify error
|
||||
* should not attempt to reexecute. Instead, the error should
|
||||
* just be propagated back.
|
||||
*/
|
||||
ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user