Always validate checksums for Direct I/O reads

This fixes an oversight in the Direct I/O PR. There is nothing that
stops a process from manipulating the contents of a buffer for a
Direct I/O read while the I/O is in flight. This can lead to checksum
verify failures even though the contents on disk are still correct,
which results in false reporting of checksum validation failures.
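
To make the race concrete, here is a condensed, self-contained
userspace sketch (modeled on the manipulate_user_buffer.c test utility
updated in this commit; the file path, block size, and minimal error
handling are illustrative):

    #define _GNU_SOURCE	/* O_DIRECT on Linux */
    #include <fcntl.h>
    #include <pthread.h>
    #include <stdlib.h>
    #include <unistd.h>

    static char *buf;		/* shared with the scribbler thread */
    static volatile int done;

    /* Dirty random bytes of the buffer while the read is in flight. */
    static void *
    scribble(void *arg)
    {
    	(void) arg;
    	while (!done)
    		buf[rand() % 4096] = (char)('a' + rand() % 26);
    	return (NULL);
    }

    int
    main(void)
    {
    	pthread_t thr;
    	int fd = open("/tank/file", O_RDONLY | O_DIRECT);

    	if (fd == -1)
    		return (1);
    	buf = aligned_alloc(4096, 4096);	/* O_DIRECT alignment */
    	(void) pthread_create(&thr, NULL, scribble, NULL);
    	/*
    	 * Nothing prevents scribble() from modifying these pages while
    	 * the read is in flight, so the data landing in buf may fail
    	 * the checksum ZFS verifies even though the on-disk copy is
    	 * intact.
    	 */
    	(void) pread(fd, buf, 4096, 0);
    	done = 1;
    	(void) pthread_join(thr, NULL);
    	(void) close(fd);
    	return (0);
    }

Build with cc -pthread; a file on a Direct I/O enabled dataset is
assumed.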

To remedy this, all Direct I/O reads that have a checksum verification
failure are treated as suspicious. In the event a checksum validation
failure occurs for a Direct I/O read, the I/O request is reissued
through the ARC. This allows actual validation to happen and removes
any possibility of the buffer being manipulated after the I/O has been
issued.
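
The core of that fallback, condensed from the zfs_read() hunk later in
this commit (surrounding error handling trimmed; dio_checksum_failure
temporarily drops UIO_DIRECT so the remaining bytes go through the
ARC):

    if (error == ECKSUM) {
    	if (uio->uio_extflg & UIO_DIRECT) {
    		/* Suspicious Direct I/O read: retry through the ARC. */
    		dio_checksum_failure = B_TRUE;
    		uio->uio_extflg &= ~UIO_DIRECT;
    		n += dio_remaining_resid;
    		dio_remaining_resid = 0;
    		continue;
    	} else {
    		error = SET_ERROR(EIO);
    	}
    }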

Just as with Direct I/O write checksum verification failures, Direct
I/O read checksum verification failures are reported through zpool
status -d in the DIO column. The zevent has also been split in two:
1. dio_verify_wr -> checksum verification failure for writes
2. dio_verify_rd -> checksum verification failure for reads
This makes it possible to determine which I/O operation was the culprit
for the checksum verification failure. All DIO errors are reported only
on the top-level VDEV.
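
For example, assuming a pool named tank, the two cases can be told
apart from the command line (event names are the ones introduced here;
the grep mirrors what the updated dio.kshlib helper does):

    zpool status -d tank                        # DIO column per top-level VDEV
    zpool events tank | grep -c dio_verify_rd   # read verify failures
    zpool events tank | grep -c dio_verify_wr   # write verify failures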

Even though FreeBSD can write protect pages (stable pages), it still
has the same issue as Linux with Direct I/O reads: a read buffer must
remain writable so the I/O can land in it, so nothing prevents its
contents from changing while the read is in flight.

This commit updates the following:
1. Propagates checksum failures for reads all the way up to the
   top-level VDEV.
2. Reports errors through zpool status -d as DIO.
3. Adds two zevents for Direct I/O checksum verify errors, one for
   reads and one for writes.
4. Updates FreeBSD ABD code to also check for ABD_FLAG_FROM_PAGES and
   handle ABD buffer contents validation the same as Linux.
5. Updates manipulate_user_buffer.c to also manipulate a buffer while a
   Direct I/O read is taking place.
6. Adds a new ZTS test case dio_read_verify that stress tests the new
   code.
7. Updates man pages.
8. Adds an IMPLY statement to zio_checksum_verify() to make sure that
   Direct I/O reads are not issued as speculative.
9. Removes self healing through mirror, raidz, and dRAID VDEVs for
   Direct I/O reads.

This issue was first observed when installing a Windows 11 VM on a ZFS
dataset with the dataset property direct set to always. The zpool
devices would report checksum failures, but a subsequent zpool scrub
would not repair any data and would report no errors.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Closes #16598

View File

@@ -9224,6 +9224,12 @@ vdev_stats_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
 		}
 	}
 
+	if (cb->cb_print_dio_verify) {
+		nice_num_str_nvlist(vds, "dio_verify_errors",
+		    vs->vs_dio_verify_errors, cb->cb_literal,
+		    cb->cb_json_as_int, ZFS_NICENUM_1024);
+	}
+
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &notpresent) == 0) {
 		nice_num_str_nvlist(vds, ZPOOL_CONFIG_NOT_PRESENT,

View File

@@ -42,7 +42,8 @@ extern "C" {
 #define	FM_EREPORT_ZFS_DATA			"data"
 #define	FM_EREPORT_ZFS_DELAY			"delay"
 #define	FM_EREPORT_ZFS_DEADMAN			"deadman"
-#define	FM_EREPORT_ZFS_DIO_VERIFY		"dio_verify"
+#define	FM_EREPORT_ZFS_DIO_VERIFY_WR		"dio_verify_wr"
+#define	FM_EREPORT_ZFS_DIO_VERIFY_RD		"dio_verify_rd"
 #define	FM_EREPORT_ZFS_POOL			"zpool"
 #define	FM_EREPORT_ZFS_DEVICE_UNKNOWN		"vdev.unknown"
 #define	FM_EREPORT_ZFS_DEVICE_OPEN_FAILED	"vdev.open_failed"

View File

@@ -57,7 +57,7 @@ void vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
 void vdev_raidz_child_done(zio_t *);
 void vdev_raidz_io_done(zio_t *);
 void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
-struct raidz_row *vdev_raidz_row_alloc(int);
+struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
 void vdev_raidz_reflow_copy_scratch(spa_t *);
 void raidz_dtl_reassessed(vdev_t *);
 

View File

@@ -208,25 +208,25 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_FLAG_PROBE		(1ULL << 16)
 #define	ZIO_FLAG_TRYHARD	(1ULL << 17)
 #define	ZIO_FLAG_OPTIONAL	(1ULL << 18)
+#define	ZIO_FLAG_DIO_READ	(1ULL << 19)
 
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 /*
  * Flags not inherited by any children.
  */
-#define	ZIO_FLAG_DONT_QUEUE	(1ULL << 19)	/* must be first for INHERIT */
-#define	ZIO_FLAG_DONT_PROPAGATE	(1ULL << 20)
-#define	ZIO_FLAG_IO_BYPASS	(1ULL << 21)
-#define	ZIO_FLAG_IO_REWRITE	(1ULL << 22)
-#define	ZIO_FLAG_RAW_COMPRESS	(1ULL << 23)
-#define	ZIO_FLAG_RAW_ENCRYPT	(1ULL << 24)
-#define	ZIO_FLAG_GANG_CHILD	(1ULL << 25)
-#define	ZIO_FLAG_DDT_CHILD	(1ULL << 26)
-#define	ZIO_FLAG_GODFATHER	(1ULL << 27)
-#define	ZIO_FLAG_NOPWRITE	(1ULL << 28)
-#define	ZIO_FLAG_REEXECUTED	(1ULL << 29)
-#define	ZIO_FLAG_DELEGATED	(1ULL << 30)
-#define	ZIO_FLAG_DIO_CHKSUM_ERR	(1ULL << 31)
+#define	ZIO_FLAG_DONT_QUEUE	(1ULL << 20)	/* must be first for INHERIT */
+#define	ZIO_FLAG_DONT_PROPAGATE	(1ULL << 21)
+#define	ZIO_FLAG_IO_BYPASS	(1ULL << 22)
+#define	ZIO_FLAG_IO_REWRITE	(1ULL << 23)
+#define	ZIO_FLAG_RAW_COMPRESS	(1ULL << 24)
+#define	ZIO_FLAG_RAW_ENCRYPT	(1ULL << 25)
+#define	ZIO_FLAG_GANG_CHILD	(1ULL << 26)
+#define	ZIO_FLAG_DDT_CHILD	(1ULL << 27)
+#define	ZIO_FLAG_GODFATHER	(1ULL << 28)
+#define	ZIO_FLAG_NOPWRITE	(1ULL << 29)
+#define	ZIO_FLAG_REEXECUTED	(1ULL << 30)
+#define	ZIO_FLAG_DELEGATED	(1ULL << 31)
+#define	ZIO_FLAG_DIO_CHKSUM_ERR	(1ULL << 32)
 
 #define	ZIO_ALLOCATOR_NONE	(-1)
 #define	ZIO_HAS_ALLOCATOR(zio)	((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
@@ -647,6 +647,7 @@ extern void zio_vdev_io_redone(zio_t *zio);
 extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
 
 extern void zio_checksum_verified(zio_t *zio);
+extern void zio_dio_chksum_verify_error_report(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
 extern enum zio_checksum zio_checksum_select(enum zio_checksum child,

View File

@@ -436,7 +436,7 @@ write.
 It can also help to identify if reported checksum errors are tied to Direct I/O
 writes.
 Each verify error causes a
-.Sy dio_verify
+.Sy dio_verify_wr
 zevent.
 Direct Write I/O checkum verify errors can be seen with
 .Nm zpool Cm status Fl d .

View File

@@ -98,7 +98,10 @@ This can be an indicator of problems with the underlying storage device.
 The number of delay events is ratelimited by the
 .Sy zfs_slow_io_events_per_second
 module parameter.
-.It Sy dio_verify
+.It Sy dio_verify_rd
+Issued when there was a checksum verify error after a Direct I/O read has been
+issued.
+.It Sy dio_verify_wr
 Issued when there was a checksum verify error after a Direct I/O write has been
 issued.
 This event can only take place if the module parameter

View File

@@ -82,14 +82,18 @@ Specify
 .Sy --json-pool-key-guid
 to set pool GUID as key for pool objects instead of pool names.
 .It Fl d
-Display the number of Direct I/O write checksum verify errors that have occured
-on a top-level VDEV.
+Display the number of Direct I/O read/write checksum verify errors that have
+occurred on a top-level VDEV.
 See
 .Sx zfs_vdev_direct_write_verify
 in
 .Xr zfs 4
 for details about the conditions that can cause Direct I/O write checksum
 verify failures to occur.
+Direct I/O read checksum verify errors can also occur if the contents of the
+buffer are manipulated after the I/O has been issued and is in flight.
+In the case of Direct I/O read checksum verify errors, the I/O will be reissued
+through the ARC.
 .It Fl D
 Display a histogram of deduplication statistics, showing the allocated
 .Pq physically present on disk

View File

@@ -620,9 +620,16 @@ abd_borrow_buf_copy(abd_t *abd, size_t n)
 
 /*
  * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
- * no change the contents of the ABD and will ASSERT that you didn't modify
- * the buffer since it was borrowed. If you want any changes you made to buf to
- * be copied back to abd, use abd_return_buf_copy() instead.
+ * not change the contents of the ABD. If you want any changes you made to
+ * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
+ * ABD is not constructed from user pages from Direct I/O then an ASSERT
+ * checks to make sure the contents of the buffer have not changed since it was
+ * borrowed. We can not ASSERT the contents of the buffer have not changed if
+ * it is composed of user pages. While Direct I/O write pages are placed under
+ * write protection and can not be changed, this is not the case for Direct I/O
+ * reads. The pages of a Direct I/O read could be manipulated at any time.
+ * Checksum verifications in the ZIO pipeline check for this issue and handle
+ * it by returning an error on checksum verification failure.
 */
 void
 abd_return_buf(abd_t *abd, void *buf, size_t n)
@@ -632,8 +639,34 @@ abd_return_buf(abd_t *abd, void *buf, size_t n)
 #ifdef ZFS_DEBUG
 	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
 #endif
-	if (abd_is_linear(abd)) {
+	if (abd_is_from_pages(abd)) {
+		if (!abd_is_linear_page(abd))
+			zio_buf_free(buf, n);
+	} else if (abd_is_linear(abd)) {
 		ASSERT3P(buf, ==, abd_to_buf(abd));
+	} else if (abd_is_gang(abd)) {
+#ifdef ZFS_DEBUG
+		/*
+		 * We have to be careful with gang ABD's that we do not ASSERT
+		 * for any ABD's that contain user pages from Direct I/O. See
+		 * the comment above about Direct I/O read buffers possibly
+		 * being manipulated. In order to handle this, we just iterate
+		 * through the gang ABD and only verify ABD's that are not from
+		 * user pages.
+		 */
+		void *cmp_buf = buf;
+		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
+		    cabd != NULL;
+		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+			if (!abd_is_from_pages(cabd)) {
+				ASSERT0(abd_cmp_buf(cabd, cmp_buf,
+				    cabd->abd_size));
+			}
+			cmp_buf = (char *)cmp_buf + cabd->abd_size;
+		}
+#endif
+		zio_buf_free(buf, n);
 	} else {
 		ASSERT0(abd_cmp_buf(abd, buf, n));
 		zio_buf_free(buf, n);

View File

@@ -1008,7 +1008,9 @@ abd_borrow_buf_copy(abd_t *abd, size_t n)
  * borrowed. We can not ASSERT that the contents of the buffer have not changed
  * if it is composed of user pages because the pages can not be placed under
  * write protection and the user could have possibly changed the contents in
- * the pages at any time.
+ * the pages at any time. This is also an issue for Direct I/O reads. Checksum
+ * verifications in the ZIO pipeline check for this issue and handle it by
+ * returning an error on checksum verification failure.
 */
 void
 abd_return_buf(abd_t *abd, void *buf, size_t n)

View File

@@ -206,6 +206,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag,
 	{ '.', "PR", "PROBE" },
 	{ '.', "TH", "TRYHARD" },
 	{ '.', "OP", "OPTIONAL" },
+	{ '.', "RD", "DIO_READ" },
 	{ '.', "DQ", "DONT_QUEUE" },
 	{ '.', "DP", "DONT_PROPAGATE" },
 	{ '.', "BY", "IO_BYPASS" },

View File

@@ -330,7 +330,7 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 	 */
 	zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
 	    dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
-	    ZIO_FLAG_CANFAIL, &zb);
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
 	mutex_exit(&db->db_mtx);
 
 	zfs_racct_read(spa, db->db.db_size, 1, flags);

View File

@@ -1026,7 +1026,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
 	ASSERT3U(vdc->vdc_nparity, >, 0);
 
-	raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth);
+	raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth, zio);
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = vdc->vdc_nparity;
 #ifdef ZFS_DEBUG

View File

@@ -34,6 +34,7 @@
 #include <sys/zap.h>
 #include <sys/abd.h>
 #include <sys/zthr.h>
+#include <sys/fm/fs/zfs.h>
 
 /*
  * An indirect vdev corresponds to a vdev that has been removed. Since
@@ -1832,6 +1833,19 @@ vdev_indirect_io_done(zio_t *zio)
 	zio_bad_cksum_t zbc;
 	int ret = zio_checksum_error(zio, &zbc);
 
+	/*
+	 * Any Direct I/O read that has a checksum error must be treated as
+	 * suspicious as the contents of the buffer could be getting
+	 * manipulated while the I/O is taking place. The checksum verify error
+	 * will be reported to the top-level VDEV.
+	 */
+	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
+		zio->io_error = ret;
+		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+		zio_dio_chksum_verify_error_report(zio);
+		ret = 0;
+	}
+
 	if (ret == 0) {
 		zio_checksum_verified(zio);
 		return;

View File

@@ -764,6 +764,27 @@ vdev_mirror_io_done(zio_t *zio)
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
+	/*
+	 * Any Direct I/O read that has a checksum error must be treated as
+	 * suspicious as the contents of the buffer could be getting
+	 * manipulated while the I/O is taking place. The checksum verify error
+	 * will be reported to the top-level Mirror VDEV.
+	 *
+	 * There will be no attempt at reading any additional data copies. If
+	 * the buffer is still being manipulated while attempting to read from
+	 * another child, there exists a possibility that the checksum could be
+	 * verified as valid. However, the buffer contents could again get
+	 * manipulated after verifying the checksum. This would lead to bad data
+	 * being written out during self healing.
+	 */
+	if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
+	    (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+		zio_dio_chksum_verify_error_report(zio);
+		zio->io_error = vdev_mirror_worst_error(mm);
+		ASSERT3U(zio->io_error, ==, ECKSUM);
+		return;
+	}
+
 	/*
 	 * If we don't have a good copy yet, keep trying other children.
 	 */

View File

@@ -433,7 +433,7 @@ const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 };
 
 raidz_row_t *
-vdev_raidz_row_alloc(int cols)
+vdev_raidz_row_alloc(int cols, zio_t *zio)
 {
 	raidz_row_t *rr =
 	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
@@ -445,7 +445,17 @@ vdev_raidz_row_alloc(int cols)
 		raidz_col_t *rc = &rr->rr_col[c];
 		rc->rc_shadow_devidx = INT_MAX;
 		rc->rc_shadow_offset = UINT64_MAX;
-		rc->rc_allow_repair = 1;
+		/*
+		 * We can not allow self healing to take place for Direct I/O
+		 * reads. There is nothing that stops the buffer contents from
+		 * being manipulated while the I/O is in flight. It is possible
+		 * that the checksum could be verified on the buffer and then
+		 * the contents of that buffer are manipulated afterwards. This
+		 * could lead to bad data being written out during self
+		 * healing.
+		 */
+		if (!(zio->io_flags & ZIO_FLAG_DIO_READ))
+			rc->rc_allow_repair = 1;
 	}
 
 	return (rr);
 }
@@ -619,7 +629,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
 	}
 	ASSERT3U(acols, <=, scols);
-	rr = vdev_raidz_row_alloc(scols);
+	rr = vdev_raidz_row_alloc(scols, zio);
 	rm->rm_row[0] = rr;
 	rr->rr_cols = acols;
 	rr->rr_bigcols = bc;
@@ -765,7 +775,7 @@ vdev_raidz_map_alloc_expanded(zio_t *zio,
 	for (uint64_t row = 0; row < rows; row++) {
 		boolean_t row_use_scratch = B_FALSE;
-		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
+		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
 		rm->rm_row[row] = rr;
 
 		/* The starting RAIDZ (parent) vdev sector of the row. */
@@ -2633,6 +2643,20 @@ raidz_checksum_verify(zio_t *zio)
 	raidz_map_t *rm = zio->io_vsd;
 	int ret = zio_checksum_error(zio, &zbc);
 
+	/*
+	 * Any Direct I/O read that has a checksum error must be treated as
+	 * suspicious as the contents of the buffer could be getting
+	 * manipulated while the I/O is taking place. The checksum verify error
+	 * will be reported to the top-level RAIDZ VDEV.
+	 */
+	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
+		zio->io_error = ret;
+		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+		zio_dio_chksum_verify_error_report(zio);
+		zio_checksum_verified(zio);
+		return (0);
+	}
+
 	if (ret != 0 && zbc.zbc_injected != 0)
 		rm->rm_ecksuminjected = 1;
@@ -2776,6 +2800,11 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 		    (rc->rc_error == 0 || rc->rc_size == 0)) {
 			continue;
 		}
 
+		/*
+		 * We do not allow self healing for Direct I/O reads.
+		 * See comment in vdev_raidz_row_alloc().
+		 */
+		ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
+
 		zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
 		    "offset=%llx",
@@ -2979,6 +3008,8 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 
 	/* Check for success */
 	if (raidz_checksum_verify(zio) == 0) {
+		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+			return (0);
 
 		/* Reconstruction succeeded - report errors */
 		for (int i = 0; i < rm->rm_nrows; i++) {
@@ -3379,7 +3410,6 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio)
 		zio_bad_cksum_t zbc;
 		zbc.zbc_has_cksum = 0;
 		zbc.zbc_injected = rm->rm_ecksuminjected;
-
 		mutex_enter(&cvd->vdev_stat_lock);
 		cvd->vdev_stat.vs_checksum_errors++;
 		mutex_exit(&cvd->vdev_stat_lock);
@@ -3444,6 +3474,9 @@ vdev_raidz_io_done(zio_t *zio)
 		}
 
 		if (raidz_checksum_verify(zio) == 0) {
+			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+				goto done;
+
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 				vdev_raidz_io_done_verified(zio, rr);
@@ -3538,6 +3571,7 @@ vdev_raidz_io_done(zio_t *zio)
 			}
 		}
 	}
+done:
 	if (rm->rm_lr != NULL) {
 		zfs_rangelock_exit(rm->rm_lr);
 		rm->rm_lr = NULL;

View File

@@ -303,6 +303,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 	(void) cr;
 	int error = 0;
 	boolean_t frsync = B_FALSE;
+	boolean_t dio_checksum_failure = B_FALSE;
 
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
@@ -424,8 +425,26 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 
 		if (error) {
 			/* convert checksum errors into IO errors */
-			if (error == ECKSUM)
-				error = SET_ERROR(EIO);
+			if (error == ECKSUM) {
+				/*
+				 * If a Direct I/O read returned a checksum
+				 * verify error, then it must be treated as
+				 * suspicious. The contents of the buffer could
+				 * have been manipulated while the I/O was in
+				 * flight. In this case, the remainder of the
+				 * I/O request will just be reissued through
+				 * the ARC.
+				 */
+				if (uio->uio_extflg & UIO_DIRECT) {
+					dio_checksum_failure = B_TRUE;
+					uio->uio_extflg &= ~UIO_DIRECT;
+					n += dio_remaining_resid;
+					dio_remaining_resid = 0;
+					continue;
+				} else {
+					error = SET_ERROR(EIO);
+				}
+			}
 
 #if defined(__linux__)
 			/*
@@ -472,6 +491,9 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 out:
 	zfs_rangelock_exit(lr);
 
+	if (dio_checksum_failure == B_TRUE)
+		uio->uio_extflg |= UIO_DIRECT;
+
 	/*
 	 * Cleanup for Direct I/O if requested.
 	 */

View File

@@ -804,11 +804,11 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
-	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
-		ASSERT3U(*errorp, ==, EIO);
-		ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+	/*
+	 * Propagate the Direct I/O checksum verify failure to the parent.
+	 */
+	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 		pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
-	}
 
 	(*countp)--;
@@ -1573,6 +1573,14 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 		 */
 		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+
+		/*
+		 * We never allow the mirror VDEV to attempt reading from any
+		 * additional data copies after the first Direct I/O checksum
+		 * verify failure. This is to avoid bad data being written out
+		 * through the mirror during self healing. See comment in
+		 * vdev_mirror_io_done() for more details.
+		 */
+		ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
 	} else if (type == ZIO_TYPE_WRITE &&
 	    pio->io_prop.zp_direct_write == B_TRUE) {
 		/*
@@ -4555,18 +4563,18 @@ zio_vdev_io_assess(zio_t *zio)
 	}
 
 	/*
-	 * If a Direct I/O write checksum verify error has occurred then this
-	 * I/O should not attempt to be issued again. Instead the EIO will
-	 * be returned.
+	 * If a Direct I/O operation has a checksum verify error then this I/O
+	 * should not attempt to be issued again.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
-		ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
-		ASSERT3U(zio->io_error, ==, EIO);
+		if (zio->io_type == ZIO_TYPE_WRITE) {
+			ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
+			ASSERT3U(zio->io_error, ==, EIO);
+		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		return (zio);
 	}
 
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
@@ -4864,16 +4872,40 @@ zio_checksum_verify(zio_t *zio)
 		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}
 
+	ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+	IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ,
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE));
+
 	if ((error = zio_checksum_error(zio, &info)) != 0) {
 		zio->io_error = error;
 		if (error == ECKSUM &&
 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-			mutex_enter(&zio->io_vd->vdev_stat_lock);
-			zio->io_vd->vdev_stat.vs_checksum_errors++;
-			mutex_exit(&zio->io_vd->vdev_stat_lock);
-			(void) zfs_ereport_start_checksum(zio->io_spa,
-			    zio->io_vd, &zio->io_bookmark, zio,
-			    zio->io_offset, zio->io_size, &info);
+			if (zio->io_flags & ZIO_FLAG_DIO_READ) {
+				zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+				zio_t *pio = zio_unique_parent(zio);
+				/*
+				 * Any Direct I/O read that has a checksum
+				 * error must be treated as suspicious as the
+				 * contents of the buffer could be getting
+				 * manipulated while the I/O is taking place.
+				 *
+				 * The checksum verify error will only be
+				 * reported here for disk and file VDEV's and
+				 * will be reported on those that the failure
+				 * occurred on. Other types of VDEV's report the
+				 * verify failure in their own code paths.
+				 */
+				if (pio->io_child_type == ZIO_CHILD_LOGICAL) {
+					zio_dio_chksum_verify_error_report(zio);
+				}
+			} else {
+				mutex_enter(&zio->io_vd->vdev_stat_lock);
+				zio->io_vd->vdev_stat.vs_checksum_errors++;
+				mutex_exit(&zio->io_vd->vdev_stat_lock);
+				(void) zfs_ereport_start_checksum(zio->io_spa,
+				    zio->io_vd, &zio->io_bookmark, zio,
+				    zio->io_offset, zio->io_size, &info);
+			}
 		}
 	}
@@ -4899,22 +4931,8 @@ zio_dio_checksum_verify(zio_t *zio)
 	if ((error = zio_checksum_error(zio, NULL)) != 0) {
 		zio->io_error = error;
 		if (error == ECKSUM) {
-			mutex_enter(&zio->io_vd->vdev_stat_lock);
-			zio->io_vd->vdev_stat.vs_dio_verify_errors++;
-			mutex_exit(&zio->io_vd->vdev_stat_lock);
-			zio->io_error = SET_ERROR(EIO);
 			zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
-
-			/*
-			 * The EIO error must be propagated up to the logical
-			 * parent ZIO in zio_notify_parent() so it can be
-			 * returned to dmu_write_abd().
-			 */
-			zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE;
-
-			(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY,
-			    zio->io_spa, zio->io_vd, &zio->io_bookmark,
-			    zio, 0);
+			zio_dio_chksum_verify_error_report(zio);
 		}
 	}
@@ -4932,6 +4950,39 @@ zio_checksum_verified(zio_t *zio)
 	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
+/*
+ * Report Direct I/O checksum verify error and create ZED event.
+ */
+void
+zio_dio_chksum_verify_error_report(zio_t *zio)
+{
+	ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+
+	if (zio->io_child_type == ZIO_CHILD_LOGICAL)
+		return;
+
+	mutex_enter(&zio->io_vd->vdev_stat_lock);
+	zio->io_vd->vdev_stat.vs_dio_verify_errors++;
+	mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		/*
+		 * Convert checksum error for writes into EIO.
+		 */
+		zio->io_error = SET_ERROR(EIO);
+		/*
+		 * Report dio_verify_wr ZED event.
+		 */
+		(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_WR,
+		    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
+	} else {
+		/*
+		 * Report dio_verify_rd ZED event.
+		 */
+		(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_RD,
+		    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
+	}
+}
+
 /*
  * ==========================================================================
  * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
@@ -5343,10 +5394,9 @@ zio_done(zio_t *zio)
 	if (zio->io_reexecute) {
 		/*
-		 * A Direct I/O write that has a checksum verify error should
-		 * not attempt to reexecute. Instead, EAGAIN should just be
-		 * propagated back up so the write can be attempt to be issued
-		 * through the ARC.
+		 * A Direct I/O operation that has a checksum verify error
+		 * should not attempt to reexecute. Instead, the error should
+		 * just be propagated back.
		 */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));

View File

@@ -697,8 +697,8 @@ tags = ['functional', 'delegate']
 tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines',
     'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block',
     'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites',
-    'dio_property', 'dio_random', 'dio_recordsize', 'dio_unaligned_block',
-    'dio_unaligned_filesize']
+    'dio_property', 'dio_random', 'dio_read_verify', 'dio_recordsize',
+    'dio_unaligned_block', 'dio_unaligned_filesize']
 tags = ['functional', 'direct']
 
 [tests/functional/exec]

View File

@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2022 by Triad National Security, LLC.
+ * Copyright (c) 2024 by Triad National Security, LLC.
 */
 
 #include <sys/types.h>
@@ -39,51 +39,59 @@
 #define	MIN(a, b) ((a) < (b)) ? (a) : (b)
 #endif
 
-static char *outputfile = NULL;
+static char *filename = NULL;
 static int blocksize = 131072; /* 128K */
-static int wr_err_expected = 0;
+static int err_expected = 0;
+static int read_op = 0;
+static int write_op = 0;
 static int numblocks = 100;
 static char *execname = NULL;
 static int print_usage = 0;
 static int randompattern = 0;
-static int ofd;
+static int fd;
 char *buf = NULL;
 
 typedef struct {
-	int entire_file_written;
+	int entire_file_completed;
 } pthread_args_t;
 
 static void
 usage(void)
 {
 	(void) fprintf(stderr,
-	    "usage %s -o outputfile [-b blocksize] [-e wr_error_expected]\n"
-	    "         [-n numblocks] [-p randpattern] [-h help]\n"
+	    "usage %s -f filename [-b blocksize] [-e wr_error_expected]\n"
+	    "         [-n numblocks] [-p randompattern] -r read_op \n"
+	    "         -w write_op [-h help]\n"
 	    "\n"
 	    "Testing whether checksum verify works correctly for O_DIRECT.\n"
 	    "when manipulating the contents of a userspace buffer.\n"
 	    "\n"
-	    "    outputfile:      File to write to.\n"
+	    "    filename:        File to read or write to.\n"
 	    "    blocksize:       Size of each block to write (must be at \n"
 	    "                     least >= 512).\n"
-	    "    wr_err_expected: Whether pwrite() is expected to return EIO\n"
+	    "    err_expected:    Whether write() is expected to return EIO\n"
 	    "                     while manipulating the contents of the\n"
 	    "                     buffer.\n"
 	    "    numblocks:       Total number of blocksized blocks to\n"
 	    "                     write.\n"
-	    "    randpattern:     Fill data buffer with random data. Default\n"
-	    "                     behavior is to fill the buffer with the \n"
-	    "                     known data pattern (0xdeadbeef).\n"
+	    "    read_op:         Perform reads from the filename file\n"
+	    "                     while manipulating the buffer contents\n"
+	    "    write_op:        Perform writes to the filename file while\n"
+	    "                     manipulating the buffer contents\n"
+	    "    randompattern:   Fill data buffer with random data for \n"
+	    "                     writes. Default behavior is to fill the \n"
+	    "                     buffer with known data pattern (0xdeadbeef)\n"
	    "    help:            Print usage information and exit.\n"
 	    "\n"
 	    "    Required parameters:\n"
-	    "    outputfile\n"
+	    "    filename\n"
+	    "    read_op or write_op\n"
 	    "\n"
 	    "    Default Values:\n"
 	    "    blocksize       -> 131072\n"
 	    "    wr_err_expexted -> false\n"
 	    "    numblocks       -> 100\n"
-	    "    randpattern     -> false\n",
+	    "    randompattern   -> false\n",
 	    execname);
 	(void) exit(1);
 }
@@ -97,16 +105,21 @@ parse_options(int argc, char *argv[])
 	extern int optind, optopt;
 	execname = argv[0];
 
-	while ((c = getopt(argc, argv, "b:ehn:o:p")) != -1) {
+	while ((c = getopt(argc, argv, "b:ef:hn:rw")) != -1) {
 		switch (c) {
 			case 'b':
 				blocksize = atoi(optarg);
 				break;
 
 			case 'e':
-				wr_err_expected = 1;
+				err_expected = 1;
 				break;
 
+			case 'f':
+				filename = optarg;
+				break;
+
 			case 'h':
 				print_usage = 1;
 				break;
@@ -115,12 +128,12 @@ parse_options(int argc, char *argv[])
 				numblocks = atoi(optarg);
 				break;
 
-			case 'o':
-				outputfile = optarg;
+			case 'r':
+				read_op = 1;
 				break;
 
-			case 'p':
-				randompattern = 1;
+			case 'w':
+				write_op = 1;
 				break;
 
 			case ':':
@@ -141,7 +154,8 @@ parse_options(int argc, char *argv[])
 	if (errflag || print_usage == 1)
 		(void) usage();
 
-	if (blocksize < 512 || outputfile == NULL || numblocks <= 0) {
+	if (blocksize < 512 || filename == NULL || numblocks <= 0 ||
+	    (read_op == 0 && write_op == 0)) {
 		(void) fprintf(stderr,
 		    "Required paramater(s) missing or invalid.\n");
 		(void) usage();
@@ -160,10 +174,10 @@ write_thread(void *arg)
 	ssize_t wrote = 0;
 	pthread_args_t *args = (pthread_args_t *)arg;
 
-	while (!args->entire_file_written) {
-		wrote = pwrite(ofd, buf, blocksize, offset);
+	while (!args->entire_file_completed) {
+		wrote = pwrite(fd, buf, blocksize, offset);
 		if (wrote != blocksize) {
-			if (wr_err_expected)
+			if (err_expected)
 				assert(errno == EIO);
 			else
 				exit(2);
@@ -173,7 +187,35 @@ write_thread(void *arg)
 		left -= blocksize;
 
 		if (left == 0)
-			args->entire_file_written = 1;
+			args->entire_file_completed = 1;
+	}
+
+	pthread_exit(NULL);
+}
+
+/*
+ * Read blocksize * numblocks from the file using O_DIRECT.
+ */
+static void *
+read_thread(void *arg)
+{
+	size_t offset = 0;
+	int total_data = blocksize * numblocks;
+	int left = total_data;
+	ssize_t read = 0;
+	pthread_args_t *args = (pthread_args_t *)arg;
+
+	while (!args->entire_file_completed) {
+		read = pread(fd, buf, blocksize, offset);
+		if (read != blocksize) {
+			exit(2);
+		}
+
+		offset = ((offset + blocksize) % total_data);
+		left -= blocksize;
+
+		if (left == 0)
+			args->entire_file_completed = 1;
 	}
 
 	pthread_exit(NULL);
@@ -189,7 +231,7 @@ manipulate_buf_thread(void *arg)
 	char rand_char;
 	pthread_args_t *args = (pthread_args_t *)arg;
 
-	while (!args->entire_file_written) {
+	while (!args->entire_file_completed) {
 		rand_offset = (rand() % blocksize);
 		rand_char = (rand() % (126 - 33) + 33);
 		buf[rand_offset] = rand_char;
@@ -202,9 +244,9 @@ int
 main(int argc, char *argv[])
 {
 	const char *datapattern = "0xdeadbeef";
-	int ofd_flags = O_WRONLY | O_CREAT | O_DIRECT;
+	int fd_flags = O_DIRECT;
 	mode_t mode = S_IRUSR | S_IWUSR;
-	pthread_t write_thr;
+	pthread_t io_thr;
 	pthread_t manipul_thr;
 	int left = blocksize;
 	int offset = 0;
@@ -213,9 +255,15 @@ main(int argc, char *argv[])
 
 	parse_options(argc, argv);
 
-	ofd = open(outputfile, ofd_flags, mode);
-	if (ofd == -1) {
-		(void) fprintf(stderr, "%s, %s\n", execname, outputfile);
+	if (write_op) {
+		fd_flags |= (O_WRONLY | O_CREAT);
+	} else {
+		fd_flags |= O_RDONLY;
+	}
+
+	fd = open(filename, fd_flags, mode);
+	if (fd == -1) {
+		(void) fprintf(stderr, "%s, %s\n", execname, filename);
 		perror("open");
 		exit(2);
 	}
@@ -228,24 +276,22 @@ main(int argc, char *argv[])
 		exit(2);
 	}
 
-	if (!randompattern) {
-		/* Putting known data pattern in buffer */
-		while (left) {
-			size_t amt = MIN(strlen(datapattern), left);
-			memcpy(&buf[offset], datapattern, amt);
-			offset += amt;
-			left -= amt;
+	if (write_op) {
+		if (!randompattern) {
+			/* Putting known data pattern in buffer */
+			while (left) {
+				size_t amt = MIN(strlen(datapattern), left);
+				memcpy(&buf[offset], datapattern, amt);
+				offset += amt;
+				left -= amt;
+			}
+		} else {
+			/* Putting random data in buffer */
+			for (int i = 0; i < blocksize; i++)
+				buf[i] = rand();
 		}
-	} else {
-		/* Putting random data in buffer */
-		for (int i = 0; i < blocksize; i++)
-			buf[i] = rand();
 	}
 
-	/*
-	 * Writing using O_DIRECT while manipulating the buffer contents until
-	 * the entire file is written.
-	 */
 	if ((rc = pthread_create(&manipul_thr, NULL, manipulate_buf_thread,
 	    &args))) {
 		fprintf(stderr, "error: pthreads_create, manipul_thr, "
@@ -253,18 +299,34 @@ main(int argc, char *argv[])
 		exit(2);
 	}
 
-	if ((rc = pthread_create(&write_thr, NULL, write_thread, &args))) {
-		fprintf(stderr, "error: pthreads_create, write_thr, "
-		    "rc: %d\n", rc);
-		exit(2);
+	if (write_op) {
+		/*
+		 * Writing using O_DIRECT while manipulating the buffer
+		 * contents until the entire file is written.
+		 */
+		if ((rc = pthread_create(&io_thr, NULL, write_thread, &args))) {
+			fprintf(stderr, "error: pthreads_create, io_thr, "
+			    "rc: %d\n", rc);
+			exit(2);
+		}
+	} else {
+		/*
+		 * Reading using O_DIRECT while manipulating the buffer
+		 * contents until the entire file is read.
+		 */
+		if ((rc = pthread_create(&io_thr, NULL, read_thread, &args))) {
+			fprintf(stderr, "error: pthreads_create, io_thr, "
			    "rc: %d\n", rc);
+			exit(2);
+		}
 	}
 
-	pthread_join(write_thr, NULL);
+	pthread_join(io_thr, NULL);
 	pthread_join(manipul_thr, NULL);
 
-	assert(args.entire_file_written == 1);
+	assert(args.entire_file_completed == 1);
 
-	(void) close(ofd);
+	(void) close(fd);
 
 	free(buf);

View File

@@ -1477,6 +1477,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/direct/dio_overwrites.ksh \
 	functional/direct/dio_property.ksh \
 	functional/direct/dio_random.ksh \
+	functional/direct/dio_read_verify.ksh \
 	functional/direct/dio_recordsize.ksh \
 	functional/direct/dio_unaligned_block.ksh \
 	functional/direct/dio_unaligned_filesize.ksh \

View File

@@ -84,8 +84,9 @@ function get_zpool_status_chksum_verify_failures # pool_name vdev_type
 function get_zed_dio_verify_events # pool
 {
 	typeset pool=$1
+	typeset op=$2
 
-	val=$(zpool events $pool | grep -c dio_verify)
+	val=$(zpool events $pool | grep -c "dio_verify_${op}")
 
 	echo "$val"
 }
@@ -96,11 +97,12 @@ function get_zed_dio_verify_events # pool
 # zpool events
 # After getting that counts will clear the out the ZPool errors and events
 #
-function check_dio_write_chksum_verify_failures # pool vdev_type expect_errors
+function check_dio_chksum_verify_failures # pool vdev_type expect_errors op
 {
 	typeset pool=$1
 	typeset vdev_type=$2
 	typeset expect_errors=$3
+	typeset op=$4
 
 	typeset note_str="expecting none"
 	if [[ $expect_errors -ne 0 ]]; then
@@ -108,10 +110,10 @@ function check_dio_write_chksum_verify_failures # pool vdev_type expect_errors
 	fi
 
 	log_note "Checking for Direct I/O write checksum verify errors \
-	    $note_str on ZPool: $pool"
+	    $note_str on ZPool: $pool with $vdev_type"
 
 	status_failures=$(get_zpool_status_chksum_verify_failures $pool $vdev_type)
-	zed_dio_verify_events=$(get_zed_dio_verify_events $pool)
+	zed_dio_verify_events=$(get_zed_dio_verify_events $pool $op)
 
 	if [[ $expect_errors -ne 0 ]]; then
 		if [[ $status_failures -eq 0 ||

View File

@@ -0,0 +1,107 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024 by Triad National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/direct/dio.cfg
. $STF_SUITE/tests/functional/direct/dio.kshlib

#
# DESCRIPTION:
#	Verify checksum verify works for Direct I/O reads.
#
# STRATEGY:
#	1. Create a zpool from each vdev type.
#	2. Start a Direct I/O read workload while manipulating the user buffer
#	   contents.
#	3. Verify there are Direct I/O read verify failures using
#	   zpool status -d and checking for zevents. We also make sure there
#	   are no reported data errors.
#

verify_runnable "global"

log_assert "Verify checksum verify works for Direct I/O reads."

log_onexit dio_cleanup

NUMBLOCKS=300
BS=$((128 * 1024)) # 128k

log_must truncate -s $MINVDEVSIZE $DIO_VDEVS

# We will verify that there are no data errors for any Direct I/O read while
# manipulating the buffer contents while the I/O is still in flight, and also
# that Direct I/O checksum verify failures and dio_verify_rd zevents are
# reported.
for type in "" "mirror" "raidz" "draid"; do
	typeset vdev_type=$type
	if [[ "${vdev_type}" == "" ]]; then
		vdev_type="stripe"
	fi

	log_note "Verifying Direct I/O read checksum verify with VDEV type \
	    ${vdev_type}"

	create_pool $TESTPOOL1 $type $DIO_VDEVS
	log_must eval "zfs create -o recordsize=128k -o compression=off \
	    $TESTPOOL1/$TESTFS1"

	mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1)
	prev_dio_rd=$(get_iostats_stat $TESTPOOL1 direct_read_count)
	prev_arc_rd=$(get_iostats_stat $TESTPOOL1 arc_read_count)

	# Create the file before trying to manipulate the contents
	log_must stride_dd -o "$mntpnt/direct-write.iso" -i /dev/urandom \
	    -b $BS -c $NUMBLOCKS -D

	# Manipulate the buffer contents while reading the file with Direct I/O
	log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \
	    -n $NUMBLOCKS -b $BS -r

	# Getting new Direct I/O and ARC read counts.
	curr_dio_rd=$(get_iostats_stat $TESTPOOL1 direct_read_count)
	curr_arc_rd=$(get_iostats_stat $TESTPOOL1 arc_read_count)
	total_dio_rd=$((curr_dio_rd - prev_dio_rd))
	total_arc_rd=$((curr_arc_rd - prev_arc_rd))

	log_note "Making sure there are no checksum errors with the ZPool"
	log_must check_pool_status $TESTPOOL "errors" "No known data errors"

	log_note "Making sure we have Direct I/O and ARC reads logged"
	if [[ $total_dio_rd -lt 1 ]]; then
		log_fail "No Direct I/O reads $total_dio_rd"
	fi
	if [[ $total_arc_rd -lt 1 ]]; then
		log_fail "No ARC reads $total_arc_rd"
	fi

	log_note "Making sure we have Direct I/O read checksum verifies with ZPool"
	check_dio_chksum_verify_failures "$TESTPOOL1" "$vdev_type" 1 "rd"

	destroy_pool $TESTPOOL1
done

log_pass "Verified checksum verify works for Direct I/O reads."

View File

@@ -46,7 +46,7 @@ verify_runnable "global"
 function cleanup
 {
 	log_must rm -f "$mntpnt/direct-write.iso"
-	check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0
+	check_dio_chksum_verify_failures $TESTPOOL "raidz" 0 "wr"
 }
 
 log_assert "Verify stable pages work for Direct I/O writes."
@@ -76,8 +76,8 @@ do
 	# Manipulate the user's buffer while running O_DIRECT write
 	# workload with the buffer.
-	log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \
-	    -n $NUMBLOCKS -b $BS
+	log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \
+	    -n $NUMBLOCKS -b $BS -w
 
 	# Reading back the contents of the file
 	log_must stride_dd -i $mntpnt/direct-write.iso -o /dev/null \

View File

@@ -91,8 +91,8 @@ log_must set_tunable32 VDEV_DIRECT_WR_VERIFY 0
 log_note "Verifying no panics for Direct I/O writes with compression"
 log_must zfs set compression=on $TESTPOOL/$TESTFS
 
 prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count)
-log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" -n $NUMBLOCKS \
-    -b $BS
+log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" -n $NUMBLOCKS \
+    -b $BS -w
 curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count)
 total_dio_wr=$((curr_dio_wr - prev_dio_wr))
@@ -116,8 +116,8 @@ for i in $(seq 1 $ITERATIONS); do
 	    $i of $ITERATIONS with zfs_vdev_direct_write_verify=0"
 	prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count)
 
-	log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \
-	    -n $NUMBLOCKS -b $BS
+	log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \
+	    -n $NUMBLOCKS -b $BS -w
 
 	# Reading file back to verify checksum errors
 	filesize=$(get_file_size "$mntpnt/direct-write.iso")
@@ -144,7 +144,7 @@ for i in $(seq 1 $ITERATIONS); do
 	fi
 	log_note "Making sure we have no Direct I/O write checksum verifies \
 	    with ZPool"
-	check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0
+	check_dio_chksum_verify_failures $TESTPOOL "raidz" 0 "wr"
 	log_must rm -f "$mntpnt/direct-write.iso"
 done
@@ -166,8 +166,8 @@ for i in $(seq 1 $ITERATIONS); do
 	    $ITERATIONS with zfs_vdev_direct_write_verify=1"
 	prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count)
 
-	log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \
-	    -n $NUMBLOCKS -b $BS -e
+	log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \
+	    -n $NUMBLOCKS -b $BS -e -w
 
 	# Reading file back to verify there no are checksum errors
 	filesize=$(get_file_size "$mntpnt/direct-write.iso")
@@ -175,7 +175,7 @@ for i in $(seq 1 $ITERATIONS); do
 	log_must stride_dd -i "$mntpnt/direct-write.iso" -o /dev/null -b $BS \
 	    -c $num_blocks
 
-	# Getting new Direct I/O and ARC Write counts.
+	# Getting new Direct I/O write counts.
 	curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count)
 	total_dio_wr=$((curr_dio_wr - prev_dio_wr))
@@ -188,7 +188,7 @@ for i in $(seq 1 $ITERATIONS); do
 	fi
 
 	log_note "Making sure we have Direct I/O write checksum verifies with ZPool"
-	check_dio_write_chksum_verify_failures "$TESTPOOL" "raidz" 1
+	check_dio_chksum_verify_failures "$TESTPOOL" "raidz" 1 "wr"
 done
 
 log_must rm -f "$mntpnt/direct-write.iso"