mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-01-27 10:24:22 +03:00
b4e4cbeb20
This fixes an oversight in the Direct I/O PR. There is nothing that stops a process from manipulating the contents of a buffer for a Direct I/O read while the I/O is in flight. This can lead checksum verify failures. However, the disk contents are still correct, and this would lead to false reporting of checksum validation failures. To remedy this, all Direct I/O reads that have a checksum verification failure are treated as suspicious. In the event a checksum validation failure occurs for a Direct I/O read, then the I/O request will be reissued though the ARC. This allows for actual validation to happen and removes any possibility of the buffer being manipulated after the I/O has been issued. Just as with Direct I/O write checksum validation failures, Direct I/O read checksum validation failures are reported though zpool status -d in the DIO column. Also the zevent has been updated to have both: 1. dio_verify_wr -> Checksum verification failure for writes 2. dio_verify_rd -> Checksum verification failure for reads. This allows for determining what I/O operation was the culprit for the checksum verification failure. All DIO errors are reported only on the top-level VDEV. Even though FreeBSD can write protect pages (stable pages) it still has the same issue as Linux with Direct I/O reads. This commit updates the following: 1. Propogates checksum failures for reads all the way up to the top-level VDEV. 2. Reports errors through zpool status -d as DIO. 3. Has two zevents for checksum verify errors with Direct I/O. One for read and one for write. 4. Updates FreeBSD ABD code to also check for ABD_FLAG_FROM_PAGES and handle ABD buffer contents validation the same as Linux. 5. Updated manipulate_user_buffer.c to also manipulate a buffer while a Direct I/O read is taking place. 6. Adds a new ZTS test case dio_read_verify that stress tests the new code. 7. Updated man pages. 8. Added an IMPLY statement to zio_checksum_verify() to make sure that Direct I/O reads are not issued as speculative. 9. Removed self healing through mirror, raidz, and dRAID VDEVs for Direct I/O reads. This issue was first observed when installing a Windows 11 VM on a ZFS dataset with the dataset property direct set to always. The zpool devices would report checksum failures, but running a subsequent zpool scrub would not repair any data and report no errors. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Brian Atkinson <batkinson@lanl.gov> Closes #16598
281 lines
7.2 KiB
C
281 lines
7.2 KiB
C
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2024, Klara Inc.
|
|
*/
|
|
|
|
#include <sys/fs/zfs.h>
|
|
#include <sys/types.h>
|
|
#include <sys/sysmacros.h>
|
|
#include <sys/string.h>
|
|
#include <sys/debug.h>
|
|
#include "zfs_valstr.h"
|
|
|
|
/*
|
|
* Each bit in a bitfield has three possible string representations:
|
|
* - single char
|
|
* - two-char pair
|
|
* - full name
|
|
*/
|
|
typedef struct {
|
|
const char vb_bit;
|
|
const char vb_pair[2];
|
|
const char *vb_name;
|
|
} valstr_bit_t;
|
|
|
|
/*
|
|
* Emits a character for each bit in `bits`, up to the number of elements
|
|
* in the table. Set bits get the character in vb_bit, clear bits get a
|
|
* space. This results in all strings having the same width, for easier
|
|
* visual comparison.
|
|
*/
|
|
static size_t
|
|
valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems,
|
|
uint64_t bits, char *out, size_t outlen)
|
|
{
|
|
ASSERT(out);
|
|
size_t n = 0;
|
|
for (int b = 0; b < nelems; b++) {
|
|
if (n == outlen)
|
|
break;
|
|
uint64_t mask = (1ULL << b);
|
|
out[n++] = (bits & mask) ? table[b].vb_bit : ' ';
|
|
}
|
|
if (n < outlen)
|
|
out[n++] = '\0';
|
|
return (n);
|
|
}
|
|
|
|
/*
|
|
* Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and
|
|
* separated by a `|` character. This gives a concise representation of the
|
|
* whole value.
|
|
*/
|
|
static size_t
|
|
valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems,
|
|
uint64_t bits, char *out, size_t outlen)
|
|
{
|
|
ASSERT(out);
|
|
size_t n = 0;
|
|
for (int b = 0; b < nelems; b++) {
|
|
ASSERT3U(n, <=, outlen);
|
|
if (n == outlen)
|
|
break;
|
|
uint64_t mask = (1ULL << b);
|
|
if (bits & mask) {
|
|
size_t len = (n > 0) ? 3 : 2;
|
|
if (n > outlen-len)
|
|
break;
|
|
if (n > 0)
|
|
out[n++] = '|';
|
|
out[n++] = table[b].vb_pair[0];
|
|
out[n++] = table[b].vb_pair[1];
|
|
}
|
|
}
|
|
if (n < outlen)
|
|
out[n++] = '\0';
|
|
return (n);
|
|
}
|
|
|
|
/*
|
|
* Emits the full name for each bit set in `bits`, taken from vb_name, and
|
|
* separated by a space. This unambiguously shows the entire set of bits, but
|
|
* can get very long.
|
|
*/
|
|
static size_t
|
|
valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems,
|
|
uint64_t bits, char *out, size_t outlen)
|
|
{
|
|
ASSERT(out);
|
|
size_t n = 0;
|
|
for (int b = 0; b < nelems; b++) {
|
|
ASSERT3U(n, <=, outlen);
|
|
if (n == outlen)
|
|
break;
|
|
uint64_t mask = (1ULL << b);
|
|
if (bits & mask) {
|
|
size_t len = strlen(table[b].vb_name);
|
|
if (n > 0)
|
|
len++;
|
|
if (n > outlen-len)
|
|
break;
|
|
if (n > 0) {
|
|
out[n++] = ' ';
|
|
len--;
|
|
}
|
|
memcpy(&out[n], table[b].vb_name, len);
|
|
n += len;
|
|
}
|
|
}
|
|
if (n < outlen)
|
|
out[n++] = '\0';
|
|
return (n);
|
|
}
|
|
|
|
/*
|
|
* Emits the name of the given enum value in the table.
|
|
*/
|
|
static size_t
|
|
valstr_enum_str(const char **table, const size_t nelems,
|
|
int v, char *out, size_t outlen)
|
|
{
|
|
ASSERT(out);
|
|
ASSERT3U(v, <, nelems);
|
|
if (v >= nelems)
|
|
return (0);
|
|
return (MIN(strlcpy(out, table[v], outlen), outlen));
|
|
}
|
|
|
|
/*
|
|
* These macros create the string tables for the given name, and implement
|
|
* the public functions described in zfs_valstr.h.
|
|
*/
|
|
#define _VALSTR_BITFIELD_IMPL(name, ...) \
|
|
static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\
|
|
size_t \
|
|
zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \
|
|
{ \
|
|
return (valstr_bitfield_bits(valstr_ ## name ## _table, \
|
|
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
|
|
} \
|
|
\
|
|
size_t \
|
|
zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \
|
|
{ \
|
|
return (valstr_bitfield_pairs(valstr_ ## name ## _table, \
|
|
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
|
|
} \
|
|
\
|
|
size_t \
|
|
zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \
|
|
{ \
|
|
return (valstr_bitfield_str(valstr_ ## name ## _table, \
|
|
ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \
|
|
} \
|
|
|
|
#define _VALSTR_ENUM_IMPL(name, ...) \
|
|
static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \
|
|
size_t \
|
|
zfs_valstr_ ## name(int v, char *out, size_t outlen) \
|
|
{ \
|
|
return (valstr_enum_str(valstr_ ## name ## _table, \
|
|
ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \
|
|
} \
|
|
|
|
|
|
/* String tables */
|
|
|
|
/* ZIO flags: zio_flag_t, typically zio->io_flags */
|
|
/* BEGIN CSTYLED */
|
|
_VALSTR_BITFIELD_IMPL(zio_flag,
|
|
{ '.', "DA", "DONT_AGGREGATE" },
|
|
{ '.', "RP", "IO_REPAIR" },
|
|
{ '.', "SH", "SELF_HEAL" },
|
|
{ '.', "RS", "RESILVER" },
|
|
{ '.', "SC", "SCRUB" },
|
|
{ '.', "ST", "SCAN_THREAD" },
|
|
{ '.', "PH", "PHYSICAL" },
|
|
{ '.', "CF", "CANFAIL" },
|
|
{ '.', "SP", "SPECULATIVE" },
|
|
{ '.', "CW", "CONFIG_WRITER" },
|
|
{ '.', "DR", "DONT_RETRY" },
|
|
{ '?', "??", "[UNUSED 11]" },
|
|
{ '.', "ND", "NODATA" },
|
|
{ '.', "ID", "INDUCE_DAMAGE" },
|
|
{ '.', "AL", "IO_ALLOCATING" },
|
|
{ '.', "RE", "IO_RETRY" },
|
|
{ '.', "PR", "PROBE" },
|
|
{ '.', "TH", "TRYHARD" },
|
|
{ '.', "OP", "OPTIONAL" },
|
|
{ '.', "RD", "DIO_READ" },
|
|
{ '.', "DQ", "DONT_QUEUE" },
|
|
{ '.', "DP", "DONT_PROPAGATE" },
|
|
{ '.', "BY", "IO_BYPASS" },
|
|
{ '.', "RW", "IO_REWRITE" },
|
|
{ '.', "CM", "RAW_COMPRESS" },
|
|
{ '.', "EN", "RAW_ENCRYPT" },
|
|
{ '.', "GG", "GANG_CHILD" },
|
|
{ '.', "DD", "DDT_CHILD" },
|
|
{ '.', "GF", "GODFATHER" },
|
|
{ '.', "NP", "NOPWRITE" },
|
|
{ '.', "EX", "REEXECUTED" },
|
|
{ '.', "DG", "DELEGATED" },
|
|
{ '.', "DC", "DIO_CHKSUM_ERR" },
|
|
)
|
|
/* END CSTYLED */
|
|
|
|
/*
|
|
* ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or
|
|
* zio->io_pipeline.
|
|
*/
|
|
/* BEGIN CSTYLED */
|
|
_VALSTR_BITFIELD_IMPL(zio_stage,
|
|
{ 'O', "O ", "OPEN" },
|
|
{ 'I', "RI", "READ_BP_INIT" },
|
|
{ 'I', "WI", "WRITE_BP_INIT" },
|
|
{ 'I', "FI", "FREE_BP_INIT" },
|
|
{ 'A', "IA", "ISSUE_ASYNC" },
|
|
{ 'W', "WC", "WRITE_COMPRESS" },
|
|
{ 'E', "EN", "ENCRYPT" },
|
|
{ 'C', "CG", "CHECKSUM_GENERATE" },
|
|
{ 'N', "NW", "NOP_WRITE" },
|
|
{ 'B', "BF", "BRT_FREE" },
|
|
{ 'd', "dS", "DDT_READ_START" },
|
|
{ 'd', "dD", "DDT_READ_DONE" },
|
|
{ 'd', "dW", "DDT_WRITE" },
|
|
{ 'd', "dF", "DDT_FREE" },
|
|
{ 'G', "GA", "GANG_ASSEMBLE" },
|
|
{ 'G', "GI", "GANG_ISSUE" },
|
|
{ 'D', "DT", "DVA_THROTTLE" },
|
|
{ 'D', "DA", "DVA_ALLOCATE" },
|
|
{ 'D', "DF", "DVA_FREE" },
|
|
{ 'D', "DC", "DVA_CLAIM" },
|
|
{ 'R', "R ", "READY" },
|
|
{ 'V', "VS", "VDEV_IO_START" },
|
|
{ 'V', "VD", "VDEV_IO_DONE" },
|
|
{ 'V', "VA", "VDEV_IO_ASSESS" },
|
|
{ 'C', "CV", "CHECKSUM_VERIFY" },
|
|
{ 'C', "DC", "DIO_CHECKSUM_VERIFY" },
|
|
{ 'X', "X ", "DONE" },
|
|
)
|
|
/* END CSTYLED */
|
|
|
|
/* ZIO priority: zio_priority_t, typically zio->io_priority */
|
|
/* BEGIN CSTYLED */
|
|
_VALSTR_ENUM_IMPL(zio_priority,
|
|
"SYNC_READ",
|
|
"SYNC_WRITE",
|
|
"ASYNC_READ",
|
|
"ASYNC_WRITE",
|
|
"SCRUB",
|
|
"REMOVAL",
|
|
"INITIALIZING",
|
|
"TRIM",
|
|
"REBUILD",
|
|
"[NUM_QUEUEABLE]",
|
|
"NOW",
|
|
)
|
|
/* END CSTYLED */
|
|
|
|
#undef _VALSTR_BITFIELD_IMPL
|
|
#undef _VALSTR_ENUM_IMPL
|